LCOV - code coverage report
Current view: top level - kernel/events - core.c (source / functions)
Test: landlock.info
Date: 2021-04-22 12:43:58

                  Hit    Total    Coverage
Lines:            282     5350       5.3 %
Functions:         25      365       6.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0
       2             : /*
       3             :  * Performance events core code:
       4             :  *
       5             :  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
       6             :  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
       7             :  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
       8             :  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
       9             :  */
      10             : 
      11             : #include <linux/fs.h>
      12             : #include <linux/mm.h>
      13             : #include <linux/cpu.h>
      14             : #include <linux/smp.h>
      15             : #include <linux/idr.h>
      16             : #include <linux/file.h>
      17             : #include <linux/poll.h>
      18             : #include <linux/slab.h>
      19             : #include <linux/hash.h>
      20             : #include <linux/tick.h>
      21             : #include <linux/sysfs.h>
      22             : #include <linux/dcache.h>
      23             : #include <linux/percpu.h>
      24             : #include <linux/ptrace.h>
      25             : #include <linux/reboot.h>
      26             : #include <linux/vmstat.h>
      27             : #include <linux/device.h>
      28             : #include <linux/export.h>
      29             : #include <linux/vmalloc.h>
      30             : #include <linux/hardirq.h>
      31             : #include <linux/hugetlb.h>
      32             : #include <linux/rculist.h>
      33             : #include <linux/uaccess.h>
      34             : #include <linux/syscalls.h>
      35             : #include <linux/anon_inodes.h>
      36             : #include <linux/kernel_stat.h>
      37             : #include <linux/cgroup.h>
      38             : #include <linux/perf_event.h>
      39             : #include <linux/trace_events.h>
      40             : #include <linux/hw_breakpoint.h>
      41             : #include <linux/mm_types.h>
      42             : #include <linux/module.h>
      43             : #include <linux/mman.h>
      44             : #include <linux/compat.h>
      45             : #include <linux/bpf.h>
      46             : #include <linux/filter.h>
      47             : #include <linux/namei.h>
      48             : #include <linux/parser.h>
      49             : #include <linux/sched/clock.h>
      50             : #include <linux/sched/mm.h>
      51             : #include <linux/proc_ns.h>
      52             : #include <linux/mount.h>
      53             : #include <linux/min_heap.h>
      54             : #include <linux/highmem.h>
      55             : #include <linux/pgtable.h>
      56             : #include <linux/buildid.h>
      57             : 
      58             : #include "internal.h"
      59             : 
      60             : #include <asm/irq_regs.h>
      61             : 
      62             : typedef int (*remote_function_f)(void *);
      63             : 
      64             : struct remote_function_call {
      65             :         struct task_struct      *p;
      66             :         remote_function_f       func;
      67             :         void                    *info;
      68             :         int                     ret;
      69             : };
      70             : 
      71           0 : static void remote_function(void *data)
      72             : {
      73           0 :         struct remote_function_call *tfc = data;
      74           0 :         struct task_struct *p = tfc->p;
      75             : 
      76           0 :         if (p) {
      77             :                 /* -EAGAIN */
      78           0 :                 if (task_cpu(p) != smp_processor_id())
      79             :                         return;
      80             : 
      81             :                 /*
       82             :          * Now that we're on the right CPU with IRQs disabled, we can test
      83             :                  * if we hit the right task without races.
      84             :                  */
      85             : 
      86           0 :                 tfc->ret = -ESRCH; /* No such (running) process */
      87           0 :                 if (p != current)
      88             :                         return;
      89             :         }
      90             : 
      91           0 :         tfc->ret = tfc->func(tfc->info);
      92             : }
      93             : 
      94             : /**
      95             :  * task_function_call - call a function on the cpu on which a task runs
      96             :  * @p:          the task to evaluate
      97             :  * @func:       the function to be called
      98             :  * @info:       the function call argument
      99             :  *
     100             :  * Calls the function @func when the task is currently running. This might
     101             :  * be on the current CPU, which just calls the function directly.  This will
     102             :  * retry due to any failures in smp_call_function_single(), such as if the
     103             :  * task_cpu() goes offline concurrently.
     104             :  *
     105             :  * returns @func return value or -ESRCH or -ENXIO when the process isn't running
     106             :  */
     107             : static int
     108           0 : task_function_call(struct task_struct *p, remote_function_f func, void *info)
     109             : {
     110           0 :         struct remote_function_call data = {
     111             :                 .p      = p,
     112             :                 .func   = func,
     113             :                 .info   = info,
     114             :                 .ret    = -EAGAIN,
     115             :         };
     116           0 :         int ret;
     117             : 
     118           0 :         for (;;) {
     119           0 :                 ret = smp_call_function_single(task_cpu(p), remote_function,
     120             :                                                &data, 1);
     121           0 :                 if (!ret)
     122           0 :                         ret = data.ret;
     123             : 
     124           0 :                 if (ret != -EAGAIN)
     125             :                         break;
     126             : 
     127           0 :                 cond_resched();
     128             :         }
     129             : 
     130           0 :         return ret;
     131             : }
     132             : 
     133             : /**
      134             :  * cpu_function_call - call a function on the given cpu
     135             :  * @func:       the function to be called
     136             :  * @info:       the function call argument
     137             :  *
     138             :  * Calls the function @func on the remote cpu.
     139             :  *
     140             :  * returns: @func return value or -ENXIO when the cpu is offline
     141             :  */
     142           0 : static int cpu_function_call(int cpu, remote_function_f func, void *info)
     143             : {
     144           0 :         struct remote_function_call data = {
     145             :                 .p      = NULL,
     146             :                 .func   = func,
     147             :                 .info   = info,
     148             :                 .ret    = -ENXIO, /* No such CPU */
     149             :         };
     150             : 
     151           0 :         smp_call_function_single(cpu, remote_function, &data, 1);
     152             : 
     153           0 :         return data.ret;
     154             : }
     155             : 
     156             : static inline struct perf_cpu_context *
     157           0 : __get_cpu_context(struct perf_event_context *ctx)
     158             : {
     159           0 :         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
     160             : }
     161             : 
     162           0 : static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
     163             :                           struct perf_event_context *ctx)
     164             : {
     165           0 :         raw_spin_lock(&cpuctx->ctx.lock);
     166           0 :         if (ctx)
     167           0 :                 raw_spin_lock(&ctx->lock);
     168           0 : }
     169             : 
     170           0 : static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
     171             :                             struct perf_event_context *ctx)
     172             : {
     173           0 :         if (ctx)
     174           0 :                 raw_spin_unlock(&ctx->lock);
     175           0 :         raw_spin_unlock(&cpuctx->ctx.lock);
     176           0 : }
     177             : 
     178             : #define TASK_TOMBSTONE ((void *)-1L)
     179             : 
     180           0 : static bool is_kernel_event(struct perf_event *event)
     181             : {
     182           0 :         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
     183             : }
     184             : 
     185             : /*
     186             :  * On task ctx scheduling...
     187             :  *
     188             :  * When !ctx->nr_events a task context will not be scheduled. This means
     189             :  * we can disable the scheduler hooks (for performance) without leaving
     190             :  * pending task ctx state.
     191             :  *
     192             :  * This however results in two special cases:
     193             :  *
      194             :  *  - removing the last event from a task ctx; this is relatively
      195             :  *    straightforward and is done in __perf_remove_from_context.
     196             :  *
     197             :  *  - adding the first event to a task ctx; this is tricky because we cannot
     198             :  *    rely on ctx->is_active and therefore cannot use event_function_call().
     199             :  *    See perf_install_in_context().
     200             :  *
     201             :  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
     202             :  */
     203             : 
     204             : typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
     205             :                         struct perf_event_context *, void *);
     206             : 
     207             : struct event_function_struct {
     208             :         struct perf_event *event;
     209             :         event_f func;
     210             :         void *data;
     211             : };
     212             : 
     213           0 : static int event_function(void *info)
     214             : {
     215           0 :         struct event_function_struct *efs = info;
     216           0 :         struct perf_event *event = efs->event;
     217           0 :         struct perf_event_context *ctx = event->ctx;
     218           0 :         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
     219           0 :         struct perf_event_context *task_ctx = cpuctx->task_ctx;
     220           0 :         int ret = 0;
     221             : 
     222           0 :         lockdep_assert_irqs_disabled();
     223             : 
     224           0 :         perf_ctx_lock(cpuctx, task_ctx);
     225             :         /*
      226             :          * Since we do the IPI call without holding ctx->lock, things can have
      227             :          * changed; double-check that we hit the task we set out to hit.
     228             :          */
     229           0 :         if (ctx->task) {
     230           0 :                 if (ctx->task != current) {
     231           0 :                         ret = -ESRCH;
     232           0 :                         goto unlock;
     233             :                 }
     234             : 
     235             :                 /*
     236             :                  * We only use event_function_call() on established contexts,
     237             :                  * and event_function() is only ever called when active (or
     238             :                  * rather, we'll have bailed in task_function_call() or the
     239             :                  * above ctx->task != current test), therefore we must have
     240             :                  * ctx->is_active here.
     241             :                  */
     242           0 :                 WARN_ON_ONCE(!ctx->is_active);
     243             :                 /*
     244             :                  * And since we have ctx->is_active, cpuctx->task_ctx must
     245             :                  * match.
     246             :                  */
     247           0 :                 WARN_ON_ONCE(task_ctx != ctx);
     248             :         } else {
     249           0 :                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
     250             :         }
     251             : 
     252           0 :         efs->func(event, cpuctx, ctx, efs->data);
     253           0 : unlock:
     254           0 :         perf_ctx_unlock(cpuctx, task_ctx);
     255             : 
     256           0 :         return ret;
     257             : }
     258             : 
     259           0 : static void event_function_call(struct perf_event *event, event_f func, void *data)
     260             : {
     261           0 :         struct perf_event_context *ctx = event->ctx;
     262           0 :         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
     263           0 :         struct event_function_struct efs = {
     264             :                 .event = event,
     265             :                 .func = func,
     266             :                 .data = data,
     267             :         };
     268             : 
     269           0 :         if (!event->parent) {
     270             :                 /*
     271             :                  * If this is a !child event, we must hold ctx::mutex to
     272             :                  * stabilize the event->ctx relation. See
     273             :                  * perf_event_ctx_lock().
     274             :                  */
     275           0 :                 lockdep_assert_held(&ctx->mutex);
     276             :         }
     277             : 
     278           0 :         if (!task) {
     279           0 :                 cpu_function_call(event->cpu, event_function, &efs);
     280           0 :                 return;
     281             :         }
     282             : 
     283           0 :         if (task == TASK_TOMBSTONE)
     284             :                 return;
     285             : 
     286           0 : again:
     287           0 :         if (!task_function_call(task, event_function, &efs))
     288             :                 return;
     289             : 
     290           0 :         raw_spin_lock_irq(&ctx->lock);
     291             :         /*
     292             :          * Reload the task pointer, it might have been changed by
     293             :          * a concurrent perf_event_context_sched_out().
     294             :          */
     295           0 :         task = ctx->task;
     296           0 :         if (task == TASK_TOMBSTONE) {
     297           0 :                 raw_spin_unlock_irq(&ctx->lock);
     298           0 :                 return;
     299             :         }
     300           0 :         if (ctx->is_active) {
     301           0 :                 raw_spin_unlock_irq(&ctx->lock);
     302           0 :                 goto again;
     303             :         }
     304           0 :         func(event, NULL, ctx, data);
     305           0 :         raw_spin_unlock_irq(&ctx->lock);
     306             : }
     307             : 
     308             : /*
     309             :  * Similar to event_function_call() + event_function(), but hard assumes IRQs
     310             :  * are already disabled and we're on the right CPU.
     311             :  */
     312           0 : static void event_function_local(struct perf_event *event, event_f func, void *data)
     313             : {
     314           0 :         struct perf_event_context *ctx = event->ctx;
     315           0 :         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
     316           0 :         struct task_struct *task = READ_ONCE(ctx->task);
     317           0 :         struct perf_event_context *task_ctx = NULL;
     318             : 
     319           0 :         lockdep_assert_irqs_disabled();
     320             : 
     321           0 :         if (task) {
     322           0 :                 if (task == TASK_TOMBSTONE)
     323             :                         return;
     324             : 
     325             :                 task_ctx = ctx;
     326             :         }
     327             : 
     328           0 :         perf_ctx_lock(cpuctx, task_ctx);
     329             : 
     330           0 :         task = ctx->task;
     331           0 :         if (task == TASK_TOMBSTONE)
     332           0 :                 goto unlock;
     333             : 
     334           0 :         if (task) {
     335             :                 /*
     336             :                  * We must be either inactive or active and the right task,
     337             :                  * otherwise we're screwed, since we cannot IPI to somewhere
     338             :                  * else.
     339             :                  */
     340           0 :                 if (ctx->is_active) {
     341           0 :                         if (WARN_ON_ONCE(task != current))
     342           0 :                                 goto unlock;
     343             : 
     344           0 :                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
     345           0 :                                 goto unlock;
     346             :                 }
     347             :         } else {
     348           0 :                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
     349             :         }
     350             : 
     351           0 :         func(event, cpuctx, ctx, data);
     352           0 : unlock:
     353           0 :         perf_ctx_unlock(cpuctx, task_ctx);
     354             : }
     355             : 
     356             : #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
     357             :                        PERF_FLAG_FD_OUTPUT  |\
     358             :                        PERF_FLAG_PID_CGROUP |\
     359             :                        PERF_FLAG_FD_CLOEXEC)
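
The PERF_FLAG_* bits combined above form the flags argument of the perf_event_open(2) syscall. As a hedged illustration only (user-space code, not part of this file), a minimal opener that passes PERF_FLAG_FD_CLOEXEC could look like:

/*
 * Hypothetical user-space sketch: open a software CPU-clock counter on
 * the calling task, requesting a close-on-exec file descriptor.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_CPU_CLOCK;

        /* pid = 0 (current task), cpu = -1 (any CPU), no group leader */
        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1,
                     PERF_FLAG_FD_CLOEXEC);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        close(fd);
        return 0;
}
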
     360             : 
     361             : /*
     362             :  * branch priv levels that need permission checks
     363             :  */
     364             : #define PERF_SAMPLE_BRANCH_PERM_PLM \
     365             :         (PERF_SAMPLE_BRANCH_KERNEL |\
     366             :          PERF_SAMPLE_BRANCH_HV)
     367             : 
     368             : enum event_type_t {
     369             :         EVENT_FLEXIBLE = 0x1,
     370             :         EVENT_PINNED = 0x2,
     371             :         EVENT_TIME = 0x4,
     372             :         /* see ctx_resched() for details */
     373             :         EVENT_CPU = 0x8,
     374             :         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
     375             : };
     376             : 
     377             : /*
     378             :  * perf_sched_events : >0 events exist
     379             :  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
     380             :  */
     381             : 
     382             : static void perf_sched_delayed(struct work_struct *work);
     383             : DEFINE_STATIC_KEY_FALSE(perf_sched_events);
     384             : static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
     385             : static DEFINE_MUTEX(perf_sched_mutex);
     386             : static atomic_t perf_sched_count;
     387             : 
     388             : static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
     389             : static DEFINE_PER_CPU(int, perf_sched_cb_usages);
     390             : static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
     391             : 
     392             : static atomic_t nr_mmap_events __read_mostly;
     393             : static atomic_t nr_comm_events __read_mostly;
     394             : static atomic_t nr_namespaces_events __read_mostly;
     395             : static atomic_t nr_task_events __read_mostly;
     396             : static atomic_t nr_freq_events __read_mostly;
     397             : static atomic_t nr_switch_events __read_mostly;
     398             : static atomic_t nr_ksymbol_events __read_mostly;
     399             : static atomic_t nr_bpf_events __read_mostly;
     400             : static atomic_t nr_cgroup_events __read_mostly;
     401             : static atomic_t nr_text_poke_events __read_mostly;
     402             : static atomic_t nr_build_id_events __read_mostly;
     403             : 
     404             : static LIST_HEAD(pmus);
     405             : static DEFINE_MUTEX(pmus_lock);
     406             : static struct srcu_struct pmus_srcu;
     407             : static cpumask_var_t perf_online_mask;
     408             : 
     409             : /*
     410             :  * perf event paranoia level:
     411             :  *  -1 - not paranoid at all
     412             :  *   0 - disallow raw tracepoint access for unpriv
     413             :  *   1 - disallow cpu events for unpriv
     414             :  *   2 - disallow kernel profiling for unpriv
     415             :  */
     416             : int sysctl_perf_event_paranoid __read_mostly = 2;
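
The paranoia level is exposed to user space as the kernel.perf_event_paranoid sysctl (readable at /proc/sys/kernel/perf_event_paranoid). A minimal sketch, assuming a standard procfs mount, that reports the current level:

/* Hypothetical user-space sketch: print the current paranoia level. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
        int level;

        if (!f)
                return 1;
        if (fscanf(f, "%d", &level) == 1)
                printf("kernel.perf_event_paranoid = %d\n", level);
        fclose(f);
        return 0;
}
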
     417             : 
     418             : /* Minimum for 512 kiB + 1 user control page */
     419             : int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
     420             : 
     421             : /*
     422             :  * max perf event sample rate
     423             :  */
     424             : #define DEFAULT_MAX_SAMPLE_RATE         100000
     425             : #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
     426             : #define DEFAULT_CPU_TIME_MAX_PERCENT    25
     427             : 
     428             : int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
     429             : 
     430             : static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
     431             : static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
     432             : 
     433             : static int perf_sample_allowed_ns __read_mostly =
     434             :         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
     435             : 
     436           0 : static void update_perf_cpu_limits(void)
     437             : {
     438           0 :         u64 tmp = perf_sample_period_ns;
     439             : 
     440           0 :         tmp *= sysctl_perf_cpu_time_max_percent;
     441           0 :         tmp = div_u64(tmp, 100);
     442           0 :         if (!tmp)
     443           0 :                 tmp = 1;
     444             : 
     445           0 :         WRITE_ONCE(perf_sample_allowed_ns, tmp);
     446           0 : }
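
To make the arithmetic above concrete, with the defaults defined earlier (DEFAULT_MAX_SAMPLE_RATE = 100000 and DEFAULT_CPU_TIME_MAX_PERCENT = 25) the sample period is NSEC_PER_SEC / 100000 = 10000 ns, so the initial budget is 10000 * 25 / 100 = 2500 ns of CPU time per sample. A stand-alone sketch of the same computation (illustration only, not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned long long sample_rate = 100000;        /* DEFAULT_MAX_SAMPLE_RATE */
        unsigned long long period_ns = 1000000000ULL / sample_rate;    /* 10000 ns */
        unsigned long long max_percent = 25;            /* DEFAULT_CPU_TIME_MAX_PERCENT */
        unsigned long long allowed_ns = period_ns * max_percent / 100;

        printf("perf_sample_allowed_ns = %llu ns\n", allowed_ns);       /* prints 2500 */
        return 0;
}
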
     447             : 
     448             : static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
     449             : 
     450           0 : int perf_proc_update_handler(struct ctl_table *table, int write,
     451             :                 void *buffer, size_t *lenp, loff_t *ppos)
     452             : {
     453           0 :         int ret;
     454           0 :         int perf_cpu = sysctl_perf_cpu_time_max_percent;
     455             :         /*
      456             :          * If throttling is disabled, don't allow the write:
     457             :          */
     458           0 :         if (write && (perf_cpu == 100 || perf_cpu == 0))
     459             :                 return -EINVAL;
     460             : 
     461           0 :         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
     462           0 :         if (ret || !write)
     463             :                 return ret;
     464             : 
     465           0 :         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
     466           0 :         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
     467           0 :         update_perf_cpu_limits();
     468             : 
     469           0 :         return 0;
     470             : }
     471             : 
     472             : int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
     473             : 
     474           0 : int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
     475             :                 void *buffer, size_t *lenp, loff_t *ppos)
     476             : {
     477           0 :         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
     478             : 
     479           0 :         if (ret || !write)
     480             :                 return ret;
     481             : 
     482           0 :         if (sysctl_perf_cpu_time_max_percent == 100 ||
     483             :             sysctl_perf_cpu_time_max_percent == 0) {
     484           0 :                 printk(KERN_WARNING
     485             :                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
     486           0 :                 WRITE_ONCE(perf_sample_allowed_ns, 0);
     487             :         } else {
     488           0 :                 update_perf_cpu_limits();
     489             :         }
     490             : 
     491             :         return 0;
     492             : }
     493             : 
     494             : /*
     495             :  * perf samples are done in some very critical code paths (NMIs).
     496             :  * If they take too much CPU time, the system can lock up and not
     497             :  * get any real work done.  This will drop the sample rate when
     498             :  * we detect that events are taking too long.
     499             :  */
     500             : #define NR_ACCUMULATED_SAMPLES 128
     501             : static DEFINE_PER_CPU(u64, running_sample_length);
     502             : 
     503             : static u64 __report_avg;
     504             : static u64 __report_allowed;
     505             : 
     506           0 : static void perf_duration_warn(struct irq_work *w)
     507             : {
     508           0 :         printk_ratelimited(KERN_INFO
     509             :                 "perf: interrupt took too long (%lld > %lld), lowering "
     510             :                 "kernel.perf_event_max_sample_rate to %d\n",
     511             :                 __report_avg, __report_allowed,
     512             :                 sysctl_perf_event_sample_rate);
     513           0 : }
     514             : 
     515             : static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
     516             : 
     517           0 : void perf_sample_event_took(u64 sample_len_ns)
     518             : {
     519           0 :         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
     520           0 :         u64 running_len;
     521           0 :         u64 avg_len;
     522           0 :         u32 max;
     523             : 
     524           0 :         if (max_len == 0)
     525             :                 return;
     526             : 
     527             :         /* Decay the counter by 1 average sample. */
     528           0 :         running_len = __this_cpu_read(running_sample_length);
     529           0 :         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
     530           0 :         running_len += sample_len_ns;
     531           0 :         __this_cpu_write(running_sample_length, running_len);
     532             : 
     533             :         /*
      534             :          * Note: this will be biased artificially low until we have
     535             :          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
     536             :          * from having to maintain a count.
     537             :          */
     538           0 :         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
     539           0 :         if (avg_len <= max_len)
     540             :                 return;
     541             : 
     542           0 :         __report_avg = avg_len;
     543           0 :         __report_allowed = max_len;
     544             : 
     545             :         /*
     546             :          * Compute a throttle threshold 25% below the current duration.
     547             :          */
     548           0 :         avg_len += avg_len / 4;
     549           0 :         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
     550           0 :         if (avg_len < max)
     551           0 :                 max /= (u32)avg_len;
     552             :         else
     553             :                 max = 1;
     554             : 
     555           0 :         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
     556           0 :         WRITE_ONCE(max_samples_per_tick, max);
     557             : 
     558           0 :         sysctl_perf_event_sample_rate = max * HZ;
     559           0 :         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
     560             : 
     561           0 :         if (!irq_work_queue(&perf_duration_work)) {
     562           0 :                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
     563             :                              "kernel.perf_event_max_sample_rate to %d\n",
     564             :                              __report_avg, __report_allowed,
     565             :                              sysctl_perf_event_sample_rate);
     566             :         }
     567             : }
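
The update above is an exponentially decaying average with a horizon of roughly NR_ACCUMULATED_SAMPLES samples, which avoids keeping an explicit sample count. A toy sketch (illustration only, not kernel code) showing how running_len converges on the average sample length:

#include <stdio.h>

#define NR_ACCUMULATED_SAMPLES 128

int main(void)
{
        unsigned long long running_len = 0;
        unsigned long long sample_len_ns = 3000;  /* pretend every sample took 3000 ns */
        int i;

        for (i = 0; i < 2000; i++) {
                /* same decay-and-add step as perf_sample_event_took() */
                running_len -= running_len / NR_ACCUMULATED_SAMPLES;
                running_len += sample_len_ns;
        }

        /* the derived average approaches sample_len_ns (biased low early on) */
        printf("avg_len = %llu ns\n", running_len / NR_ACCUMULATED_SAMPLES);
        return 0;
}
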
     568             : 
     569             : static atomic64_t perf_event_id;
     570             : 
     571             : static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
     572             :                               enum event_type_t event_type);
     573             : 
     574             : static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
     575             :                              enum event_type_t event_type,
     576             :                              struct task_struct *task);
     577             : 
     578             : static void update_context_time(struct perf_event_context *ctx);
     579             : static u64 perf_event_time(struct perf_event *event);
     580             : 
     581           0 : void __weak perf_event_print_debug(void)        { }
     582             : 
     583           0 : extern __weak const char *perf_pmu_name(void)
     584             : {
     585           0 :         return "pmu";
     586             : }
     587             : 
     588           0 : static inline u64 perf_clock(void)
     589             : {
     590           0 :         return local_clock();
     591             : }
     592             : 
     593           0 : static inline u64 perf_event_clock(struct perf_event *event)
     594             : {
     595           0 :         return event->clock();
     596             : }
     597             : 
     598             : /*
     599             :  * State based event timekeeping...
     600             :  *
     601             :  * The basic idea is to use event->state to determine which (if any) time
     602             :  * fields to increment with the current delta. This means we only need to
     603             :  * update timestamps when we change state or when they are explicitly requested
     604             :  * (read).
     605             :  *
     606             :  * Event groups make things a little more complicated, but not terribly so. The
     607             :  * rules for a group are that if the group leader is OFF the entire group is
      608             :  * OFF, irrespective of what the group member states are. This results in
     609             :  * __perf_effective_state().
     610             :  *
      611             :  * A further ramification is that when a group leader flips between OFF and
     612             :  * !OFF, we need to update all group member times.
     613             :  *
     614             :  *
     615             :  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
     616             :  * need to make sure the relevant context time is updated before we try and
     617             :  * update our timestamps.
     618             :  */
     619             : 
     620             : static __always_inline enum perf_event_state
     621           0 : __perf_effective_state(struct perf_event *event)
     622             : {
     623           0 :         struct perf_event *leader = event->group_leader;
     624             : 
     625           0 :         if (leader->state <= PERF_EVENT_STATE_OFF)
     626             :                 return leader->state;
     627             : 
     628           0 :         return event->state;
     629             : }
     630             : 
     631             : static __always_inline void
     632           0 : __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
     633             : {
     634           0 :         enum perf_event_state state = __perf_effective_state(event);
     635           0 :         u64 delta = now - event->tstamp;
     636             : 
     637           0 :         *enabled = event->total_time_enabled;
     638           0 :         if (state >= PERF_EVENT_STATE_INACTIVE)
     639           0 :                 *enabled += delta;
     640             : 
     641           0 :         *running = event->total_time_running;
     642           0 :         if (state >= PERF_EVENT_STATE_ACTIVE)
     643           0 :                 *running += delta;
     644             : }
     645             : 
     646           0 : static void perf_event_update_time(struct perf_event *event)
     647             : {
     648           0 :         u64 now = perf_event_time(event);
     649             : 
     650           0 :         __perf_update_times(event, now, &event->total_time_enabled,
     651             :                                         &event->total_time_running);
     652           0 :         event->tstamp = now;
     653           0 : }
     654             : 
     655           0 : static void perf_event_update_sibling_time(struct perf_event *leader)
     656             : {
     657           0 :         struct perf_event *sibling;
     658             : 
     659           0 :         for_each_sibling_event(sibling, leader)
     660           0 :                 perf_event_update_time(sibling);
     661           0 : }
     662             : 
     663             : static void
     664           0 : perf_event_set_state(struct perf_event *event, enum perf_event_state state)
     665             : {
     666           0 :         if (event->state == state)
     667             :                 return;
     668             : 
     669           0 :         perf_event_update_time(event);
     670             :         /*
     671             :          * If a group leader gets enabled/disabled all its siblings
     672             :          * are affected too.
     673             :          */
     674           0 :         if ((event->state < 0) ^ (state < 0))
     675           0 :                 perf_event_update_sibling_time(event);
     676             : 
     677           0 :         WRITE_ONCE(event->state, state);
     678             : }
     679             : 
     680             : #ifdef CONFIG_CGROUP_PERF
     681             : 
     682             : static inline bool
     683             : perf_cgroup_match(struct perf_event *event)
     684             : {
     685             :         struct perf_event_context *ctx = event->ctx;
     686             :         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
     687             : 
     688             :         /* @event doesn't care about cgroup */
     689             :         if (!event->cgrp)
     690             :                 return true;
     691             : 
     692             :         /* wants specific cgroup scope but @cpuctx isn't associated with any */
     693             :         if (!cpuctx->cgrp)
     694             :                 return false;
     695             : 
     696             :         /*
     697             :          * Cgroup scoping is recursive.  An event enabled for a cgroup is
     698             :          * also enabled for all its descendant cgroups.  If @cpuctx's
     699             :          * cgroup is a descendant of @event's (the test covers identity
     700             :          * case), it's a match.
     701             :          */
     702             :         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
     703             :                                     event->cgrp->css.cgroup);
     704             : }
     705             : 
     706             : static inline void perf_detach_cgroup(struct perf_event *event)
     707             : {
     708             :         css_put(&event->cgrp->css);
     709             :         event->cgrp = NULL;
     710             : }
     711             : 
     712             : static inline int is_cgroup_event(struct perf_event *event)
     713             : {
     714             :         return event->cgrp != NULL;
     715             : }
     716             : 
     717             : static inline u64 perf_cgroup_event_time(struct perf_event *event)
     718             : {
     719             :         struct perf_cgroup_info *t;
     720             : 
     721             :         t = per_cpu_ptr(event->cgrp->info, event->cpu);
     722             :         return t->time;
     723             : }
     724             : 
     725             : static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
     726             : {
     727             :         struct perf_cgroup_info *info;
     728             :         u64 now;
     729             : 
     730             :         now = perf_clock();
     731             : 
     732             :         info = this_cpu_ptr(cgrp->info);
     733             : 
     734             :         info->time += now - info->timestamp;
     735             :         info->timestamp = now;
     736             : }
     737             : 
     738             : static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
     739             : {
     740             :         struct perf_cgroup *cgrp = cpuctx->cgrp;
     741             :         struct cgroup_subsys_state *css;
     742             : 
     743             :         if (cgrp) {
     744             :                 for (css = &cgrp->css; css; css = css->parent) {
     745             :                         cgrp = container_of(css, struct perf_cgroup, css);
     746             :                         __update_cgrp_time(cgrp);
     747             :                 }
     748             :         }
     749             : }
     750             : 
     751             : static inline void update_cgrp_time_from_event(struct perf_event *event)
     752             : {
     753             :         struct perf_cgroup *cgrp;
     754             : 
     755             :         /*
     756             :          * ensure we access cgroup data only when needed and
     757             :          * when we know the cgroup is pinned (css_get)
     758             :          */
     759             :         if (!is_cgroup_event(event))
     760             :                 return;
     761             : 
     762             :         cgrp = perf_cgroup_from_task(current, event->ctx);
     763             :         /*
     764             :          * Do not update time when cgroup is not active
     765             :          */
     766             :         if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
     767             :                 __update_cgrp_time(event->cgrp);
     768             : }
     769             : 
     770             : static inline void
     771             : perf_cgroup_set_timestamp(struct task_struct *task,
     772             :                           struct perf_event_context *ctx)
     773             : {
     774             :         struct perf_cgroup *cgrp;
     775             :         struct perf_cgroup_info *info;
     776             :         struct cgroup_subsys_state *css;
     777             : 
     778             :         /*
     779             :          * ctx->lock held by caller
     780             :          * ensure we do not access cgroup data
     781             :          * unless we have the cgroup pinned (css_get)
     782             :          */
     783             :         if (!task || !ctx->nr_cgroups)
     784             :                 return;
     785             : 
     786             :         cgrp = perf_cgroup_from_task(task, ctx);
     787             : 
     788             :         for (css = &cgrp->css; css; css = css->parent) {
     789             :                 cgrp = container_of(css, struct perf_cgroup, css);
     790             :                 info = this_cpu_ptr(cgrp->info);
     791             :                 info->timestamp = ctx->timestamp;
     792             :         }
     793             : }
     794             : 
     795             : static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
     796             : 
     797             : #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
     798             : #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
     799             : 
     800             : /*
     801             :  * reschedule events based on the cgroup constraint of task.
     802             :  *
     803             :  * mode SWOUT : schedule out everything
     804             :  * mode SWIN : schedule in based on cgroup for next
     805             :  */
     806             : static void perf_cgroup_switch(struct task_struct *task, int mode)
     807             : {
     808             :         struct perf_cpu_context *cpuctx;
     809             :         struct list_head *list;
     810             :         unsigned long flags;
     811             : 
     812             :         /*
      813             :          * Disable interrupts and preemption to keep this CPU's
      814             :          * cgrp_cpuctx_entry from changing under us.
     815             :          */
     816             :         local_irq_save(flags);
     817             : 
     818             :         list = this_cpu_ptr(&cgrp_cpuctx_list);
     819             :         list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
     820             :                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
     821             : 
     822             :                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
     823             :                 perf_pmu_disable(cpuctx->ctx.pmu);
     824             : 
     825             :                 if (mode & PERF_CGROUP_SWOUT) {
     826             :                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
     827             :                         /*
     828             :                          * must not be done before ctxswout due
     829             :                          * to event_filter_match() in event_sched_out()
     830             :                          */
     831             :                         cpuctx->cgrp = NULL;
     832             :                 }
     833             : 
     834             :                 if (mode & PERF_CGROUP_SWIN) {
     835             :                         WARN_ON_ONCE(cpuctx->cgrp);
     836             :                         /*
     837             :                          * set cgrp before ctxsw in to allow
     838             :                          * event_filter_match() to not have to pass
     839             :                          * task around
     840             :                          * we pass the cpuctx->ctx to perf_cgroup_from_task()
      841             :                          * because cgroup events are only per-cpu
     842             :                          */
     843             :                         cpuctx->cgrp = perf_cgroup_from_task(task,
     844             :                                                              &cpuctx->ctx);
     845             :                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
     846             :                 }
     847             :                 perf_pmu_enable(cpuctx->ctx.pmu);
     848             :                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
     849             :         }
     850             : 
     851             :         local_irq_restore(flags);
     852             : }
     853             : 
     854             : static inline void perf_cgroup_sched_out(struct task_struct *task,
     855             :                                          struct task_struct *next)
     856             : {
     857             :         struct perf_cgroup *cgrp1;
     858             :         struct perf_cgroup *cgrp2 = NULL;
     859             : 
     860             :         rcu_read_lock();
     861             :         /*
     862             :          * we come here when we know perf_cgroup_events > 0
     863             :          * we do not need to pass the ctx here because we know
     864             :          * we are holding the rcu lock
     865             :          */
     866             :         cgrp1 = perf_cgroup_from_task(task, NULL);
     867             :         cgrp2 = perf_cgroup_from_task(next, NULL);
     868             : 
     869             :         /*
     870             :          * only schedule out current cgroup events if we know
     871             :          * that we are switching to a different cgroup. Otherwise,
      872             :          * do not touch the cgroup events.
     873             :          */
     874             :         if (cgrp1 != cgrp2)
     875             :                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
     876             : 
     877             :         rcu_read_unlock();
     878             : }
     879             : 
     880             : static inline void perf_cgroup_sched_in(struct task_struct *prev,
     881             :                                         struct task_struct *task)
     882             : {
     883             :         struct perf_cgroup *cgrp1;
     884             :         struct perf_cgroup *cgrp2 = NULL;
     885             : 
     886             :         rcu_read_lock();
     887             :         /*
     888             :          * we come here when we know perf_cgroup_events > 0
     889             :          * we do not need to pass the ctx here because we know
     890             :          * we are holding the rcu lock
     891             :          */
     892             :         cgrp1 = perf_cgroup_from_task(task, NULL);
     893             :         cgrp2 = perf_cgroup_from_task(prev, NULL);
     894             : 
     895             :         /*
     896             :          * only need to schedule in cgroup events if we are changing
     897             :          * cgroup during ctxsw. Cgroup events were not scheduled
      898             :          * out during the ctxsw-out if that was not the case.
     899             :          */
     900             :         if (cgrp1 != cgrp2)
     901             :                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
     902             : 
     903             :         rcu_read_unlock();
     904             : }
     905             : 
     906             : static int perf_cgroup_ensure_storage(struct perf_event *event,
     907             :                                 struct cgroup_subsys_state *css)
     908             : {
     909             :         struct perf_cpu_context *cpuctx;
     910             :         struct perf_event **storage;
     911             :         int cpu, heap_size, ret = 0;
     912             : 
     913             :         /*
      914             :          * Allow storage to have sufficient space for an iterator for each
     915             :          * possibly nested cgroup plus an iterator for events with no cgroup.
     916             :          */
     917             :         for (heap_size = 1; css; css = css->parent)
     918             :                 heap_size++;
     919             : 
     920             :         for_each_possible_cpu(cpu) {
     921             :                 cpuctx = per_cpu_ptr(event->pmu->pmu_cpu_context, cpu);
     922             :                 if (heap_size <= cpuctx->heap_size)
     923             :                         continue;
     924             : 
     925             :                 storage = kmalloc_node(heap_size * sizeof(struct perf_event *),
     926             :                                        GFP_KERNEL, cpu_to_node(cpu));
     927             :                 if (!storage) {
     928             :                         ret = -ENOMEM;
     929             :                         break;
     930             :                 }
     931             : 
     932             :                 raw_spin_lock_irq(&cpuctx->ctx.lock);
     933             :                 if (cpuctx->heap_size < heap_size) {
     934             :                         swap(cpuctx->heap, storage);
     935             :                         if (storage == cpuctx->heap_default)
     936             :                                 storage = NULL;
     937             :                         cpuctx->heap_size = heap_size;
     938             :                 }
     939             :                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
     940             : 
     941             :                 kfree(storage);
     942             :         }
     943             : 
     944             :         return ret;
     945             : }
     946             : 
     947             : static inline int perf_cgroup_connect(int fd, struct perf_event *event,
     948             :                                       struct perf_event_attr *attr,
     949             :                                       struct perf_event *group_leader)
     950             : {
     951             :         struct perf_cgroup *cgrp;
     952             :         struct cgroup_subsys_state *css;
     953             :         struct fd f = fdget(fd);
     954             :         int ret = 0;
     955             : 
     956             :         if (!f.file)
     957             :                 return -EBADF;
     958             : 
     959             :         css = css_tryget_online_from_dir(f.file->f_path.dentry,
     960             :                                          &perf_event_cgrp_subsys);
     961             :         if (IS_ERR(css)) {
     962             :                 ret = PTR_ERR(css);
     963             :                 goto out;
     964             :         }
     965             : 
     966             :         ret = perf_cgroup_ensure_storage(event, css);
     967             :         if (ret)
     968             :                 goto out;
     969             : 
     970             :         cgrp = container_of(css, struct perf_cgroup, css);
     971             :         event->cgrp = cgrp;
     972             : 
     973             :         /*
     974             :          * all events in a group must monitor
     975             :          * the same cgroup because a task belongs
     976             :          * to only one perf cgroup at a time
     977             :          */
     978             :         if (group_leader && group_leader->cgrp != cgrp) {
     979             :                 perf_detach_cgroup(event);
     980             :                 ret = -EINVAL;
     981             :         }
     982             : out:
     983             :         fdput(f);
     984             :         return ret;
     985             : }
     986             : 
     987             : static inline void
     988             : perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
     989             : {
     990             :         struct perf_cgroup_info *t;
     991             :         t = per_cpu_ptr(event->cgrp->info, event->cpu);
     992             :         event->shadow_ctx_time = now - t->timestamp;
     993             : }
     994             : 
     995             : static inline void
     996             : perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
     997             : {
     998             :         struct perf_cpu_context *cpuctx;
     999             : 
    1000             :         if (!is_cgroup_event(event))
    1001             :                 return;
    1002             : 
    1003             :         /*
    1004             :          * Because cgroup events are always per-cpu events,
    1005             :          * @ctx == &cpuctx->ctx.
    1006             :          */
    1007             :         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
    1008             : 
    1009             :         /*
    1010             :          * Since setting cpuctx->cgrp is conditional on the current @cgrp
    1011             :          * matching the event's cgroup, we must do this for every new event,
    1012             :          * because if the first would mismatch, the second would not try again
    1013             :          * and we would leave cpuctx->cgrp unset.
    1014             :          */
    1015             :         if (ctx->is_active && !cpuctx->cgrp) {
    1016             :                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
    1017             : 
    1018             :                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
    1019             :                         cpuctx->cgrp = cgrp;
    1020             :         }
    1021             : 
    1022             :         if (ctx->nr_cgroups++)
    1023             :                 return;
    1024             : 
    1025             :         list_add(&cpuctx->cgrp_cpuctx_entry,
    1026             :                         per_cpu_ptr(&cgrp_cpuctx_list, event->cpu));
    1027             : }
    1028             : 
    1029             : static inline void
    1030             : perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
    1031             : {
    1032             :         struct perf_cpu_context *cpuctx;
    1033             : 
    1034             :         if (!is_cgroup_event(event))
    1035             :                 return;
    1036             : 
    1037             :         /*
    1038             :          * Because cgroup events are always per-cpu events,
    1039             :          * @ctx == &cpuctx->ctx.
    1040             :          */
    1041             :         cpuctx = container_of(ctx, struct perf_cpu_context, ctx);
    1042             : 
    1043             :         if (--ctx->nr_cgroups)
    1044             :                 return;
    1045             : 
    1046             :         if (ctx->is_active && cpuctx->cgrp)
    1047             :                 cpuctx->cgrp = NULL;
    1048             : 
    1049             :         list_del(&cpuctx->cgrp_cpuctx_entry);
    1050             : }
    1051             : 
    1052             : #else /* !CONFIG_CGROUP_PERF */
    1053             : 
    1054             : static inline bool
    1055           0 : perf_cgroup_match(struct perf_event *event)
    1056             : {
    1057           0 :         return true;
    1058             : }
    1059             : 
    1060             : static inline void perf_detach_cgroup(struct perf_event *event)
    1061             : {}
    1062             : 
    1063           0 : static inline int is_cgroup_event(struct perf_event *event)
    1064             : {
    1065           0 :         return 0;
    1066             : }
    1067             : 
    1068           0 : static inline void update_cgrp_time_from_event(struct perf_event *event)
    1069             : {
    1070           0 : }
    1071             : 
    1072           0 : static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
    1073             : {
    1074           0 : }
    1075             : 
    1076             : static inline void perf_cgroup_sched_out(struct task_struct *task,
    1077             :                                          struct task_struct *next)
    1078             : {
    1079             : }
    1080             : 
    1081             : static inline void perf_cgroup_sched_in(struct task_struct *prev,
    1082             :                                         struct task_struct *task)
    1083             : {
    1084             : }
    1085             : 
    1086           0 : static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
    1087             :                                       struct perf_event_attr *attr,
    1088             :                                       struct perf_event *group_leader)
    1089             : {
    1090           0 :         return -EINVAL;
    1091             : }
    1092             : 
    1093             : static inline void
    1094           0 : perf_cgroup_set_timestamp(struct task_struct *task,
    1095             :                           struct perf_event_context *ctx)
    1096             : {
    1097             : }
    1098             : 
    1099             : static inline void
    1100             : perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
    1101             : {
    1102             : }
    1103             : 
    1104             : static inline void
    1105             : perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
    1106             : {
    1107             : }
    1108             : 
    1109             : static inline u64 perf_cgroup_event_time(struct perf_event *event)
    1110             : {
    1111             :         return 0;
    1112             : }
    1113             : 
    1114             : static inline void
    1115           0 : perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
    1116             : {
    1117           0 : }
    1118             : 
    1119             : static inline void
    1120           0 : perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
    1121             : {
    1122           0 : }
    1123             : #endif
    1124             : 
    1125             : /*
     1126             :  * Set the default to depend on the timer tick,
     1127             :  * just like the original code.
    1128             :  */
    1129             : #define PERF_CPU_HRTIMER (1000 / HZ)
    1130             : /*
    1131             :  * function must be called with interrupts disabled
    1132             :  */
    1133           0 : static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
    1134             : {
    1135           0 :         struct perf_cpu_context *cpuctx;
    1136           0 :         bool rotations;
    1137             : 
    1138           0 :         lockdep_assert_irqs_disabled();
    1139             : 
    1140           0 :         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
    1141           0 :         rotations = perf_rotate_context(cpuctx);
    1142             : 
    1143           0 :         raw_spin_lock(&cpuctx->hrtimer_lock);
    1144           0 :         if (rotations)
    1145           0 :                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
    1146             :         else
    1147           0 :                 cpuctx->hrtimer_active = 0;
    1148           0 :         raw_spin_unlock(&cpuctx->hrtimer_lock);
    1149             : 
    1150           0 :         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
    1151             : }
    1152             : 
    1153           8 : static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
    1154             : {
    1155           8 :         struct hrtimer *timer = &cpuctx->hrtimer;
    1156           8 :         struct pmu *pmu = cpuctx->ctx.pmu;
    1157           8 :         u64 interval;
    1158             : 
    1159             :         /* no multiplexing needed for SW PMU */
    1160           8 :         if (pmu->task_ctx_nr == perf_sw_context)
    1161             :                 return;
    1162             : 
    1163             :         /*
     1164             :          * Check that the default is sane; if not set, fall back to the
     1165             :          * default interval (1/tick).
    1166             :          */
    1167           4 :         interval = pmu->hrtimer_interval_ms;
    1168           4 :         if (interval < 1)
    1169           1 :                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
    1170             : 
    1171           4 :         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
    1172             : 
    1173           4 :         raw_spin_lock_init(&cpuctx->hrtimer_lock);
    1174           4 :         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
    1175           4 :         timer->function = perf_mux_hrtimer_handler;
    1176             : }
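
A quick worked example of the interval computation above, assuming HZ = 250 and pmu->hrtimer_interval_ms left unset (values chosen only for illustration):

        /*
         *   PERF_CPU_HRTIMER         = 1000 / 250 = 4  (ms)
         *   cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * 4)
         *                            = 4,000,000 ns, i.e. one rotation per tick.
         */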
    1177             : 
    1178           0 : static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
    1179             : {
    1180           0 :         struct hrtimer *timer = &cpuctx->hrtimer;
    1181           0 :         struct pmu *pmu = cpuctx->ctx.pmu;
    1182           0 :         unsigned long flags;
    1183             : 
    1184             :         /* not for SW PMU */
    1185           0 :         if (pmu->task_ctx_nr == perf_sw_context)
    1186             :                 return 0;
    1187             : 
    1188           0 :         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
    1189           0 :         if (!cpuctx->hrtimer_active) {
    1190           0 :                 cpuctx->hrtimer_active = 1;
    1191           0 :                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
    1192           0 :                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
    1193             :         }
    1194           0 :         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
    1195             : 
    1196           0 :         return 0;
    1197             : }
    1198             : 
    1199           0 : void perf_pmu_disable(struct pmu *pmu)
    1200             : {
    1201           0 :         int *count = this_cpu_ptr(pmu->pmu_disable_count);
    1202           0 :         if (!(*count)++)
    1203           0 :                 pmu->pmu_disable(pmu);
    1204           0 : }
    1205             : 
    1206           0 : void perf_pmu_enable(struct pmu *pmu)
    1207             : {
    1208           0 :         int *count = this_cpu_ptr(pmu->pmu_disable_count);
    1209           0 :         if (!--(*count))
    1210           0 :                 pmu->pmu_enable(pmu);
    1211           0 : }
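
A minimal sketch (hypothetical caller, not part of this file) showing how the per-CPU disable count above nests, so that only the outermost pair reaches the PMU callbacks:

        static void reprogram_pmu_example(struct pmu *pmu)
        {
                perf_pmu_disable(pmu);  /* count 0 -> 1: pmu->pmu_disable() runs */
                perf_pmu_disable(pmu);  /* count 1 -> 2: no callback, still off  */

                /* ... modify event scheduling state here ... */

                perf_pmu_enable(pmu);   /* count 2 -> 1: no callback yet         */
                perf_pmu_enable(pmu);   /* count 1 -> 0: pmu->pmu_enable() runs  */
        }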
    1212             : 
    1213             : static DEFINE_PER_CPU(struct list_head, active_ctx_list);
    1214             : 
    1215             : /*
    1216             :  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
    1217             :  * perf_event_task_tick() are fully serialized because they're strictly cpu
     1218             :  * affine and perf_event_ctx_{activate,deactivate}() are called with IRQs
    1219             :  * disabled, while perf_event_task_tick is called from IRQ context.
    1220             :  */
    1221           0 : static void perf_event_ctx_activate(struct perf_event_context *ctx)
    1222             : {
    1223           0 :         struct list_head *head = this_cpu_ptr(&active_ctx_list);
    1224             : 
    1225           0 :         lockdep_assert_irqs_disabled();
    1226             : 
    1227           0 :         WARN_ON(!list_empty(&ctx->active_ctx_list));
    1228             : 
    1229           0 :         list_add(&ctx->active_ctx_list, head);
    1230           0 : }
    1231             : 
    1232           0 : static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
    1233             : {
    1234           0 :         lockdep_assert_irqs_disabled();
    1235             : 
    1236           0 :         WARN_ON(list_empty(&ctx->active_ctx_list));
    1237             : 
    1238           0 :         list_del_init(&ctx->active_ctx_list);
    1239           0 : }
    1240             : 
    1241           0 : static void get_ctx(struct perf_event_context *ctx)
    1242             : {
    1243           0 :         refcount_inc(&ctx->refcount);
    1244           0 : }
    1245             : 
    1246           0 : static void *alloc_task_ctx_data(struct pmu *pmu)
    1247             : {
    1248           0 :         if (pmu->task_ctx_cache)
    1249           0 :                 return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
    1250             : 
    1251             :         return NULL;
    1252             : }
    1253             : 
    1254           0 : static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
    1255             : {
    1256           0 :         if (pmu->task_ctx_cache && task_ctx_data)
    1257           0 :                 kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
    1258           0 : }
    1259             : 
    1260           0 : static void free_ctx(struct rcu_head *head)
    1261             : {
    1262           0 :         struct perf_event_context *ctx;
    1263             : 
    1264           0 :         ctx = container_of(head, struct perf_event_context, rcu_head);
    1265           0 :         free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
    1266           0 :         kfree(ctx);
    1267           0 : }
    1268             : 
    1269           0 : static void put_ctx(struct perf_event_context *ctx)
    1270             : {
    1271           0 :         if (refcount_dec_and_test(&ctx->refcount)) {
    1272           0 :                 if (ctx->parent_ctx)
    1273           0 :                         put_ctx(ctx->parent_ctx);
    1274           0 :                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
    1275           0 :                         put_task_struct(ctx->task);
    1276           0 :                 call_rcu(&ctx->rcu_head, free_ctx);
    1277             :         }
    1278           0 : }
    1279             : 
    1280             : /*
    1281             :  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
    1282             :  * perf_pmu_migrate_context() we need some magic.
    1283             :  *
    1284             :  * Those places that change perf_event::ctx will hold both
    1285             :  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
    1286             :  *
    1287             :  * Lock ordering is by mutex address. There are two other sites where
    1288             :  * perf_event_context::mutex nests and those are:
    1289             :  *
    1290             :  *  - perf_event_exit_task_context()    [ child , 0 ]
    1291             :  *      perf_event_exit_event()
    1292             :  *        put_event()                   [ parent, 1 ]
    1293             :  *
    1294             :  *  - perf_event_init_context()         [ parent, 0 ]
    1295             :  *      inherit_task_group()
    1296             :  *        inherit_group()
    1297             :  *          inherit_event()
    1298             :  *            perf_event_alloc()
    1299             :  *              perf_init_event()
    1300             :  *                perf_try_init_event() [ child , 1 ]
    1301             :  *
     1302             :  * While it appears there is an obvious deadlock here (the parent and child
     1303             :  * nesting levels are inverted between the two), this is in fact safe because
     1304             :  * life-time rules separate them. That is, an exiting task cannot fork, and a
     1305             :  * spawning task cannot (yet) exit.
    1306             :  *
    1307             :  * But remember that these are parent<->child context relations, and
     1308             :  * migration does not affect children; therefore these two orderings should not
    1309             :  * interact.
    1310             :  *
    1311             :  * The change in perf_event::ctx does not affect children (as claimed above)
    1312             :  * because the sys_perf_event_open() case will install a new event and break
    1313             :  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
    1314             :  * concerned with cpuctx and that doesn't have children.
    1315             :  *
    1316             :  * The places that change perf_event::ctx will issue:
    1317             :  *
    1318             :  *   perf_remove_from_context();
    1319             :  *   synchronize_rcu();
    1320             :  *   perf_install_in_context();
    1321             :  *
     1322             :  * to effect the change. The remove_from_context() + synchronize_rcu() should
    1323             :  * quiesce the event, after which we can install it in the new location. This
    1324             :  * means that only external vectors (perf_fops, prctl) can perturb the event
    1325             :  * while in transit. Therefore all such accessors should also acquire
    1326             :  * perf_event_context::mutex to serialize against this.
    1327             :  *
    1328             :  * However; because event->ctx can change while we're waiting to acquire
    1329             :  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
    1330             :  * function.
    1331             :  *
    1332             :  * Lock order:
    1333             :  *    exec_update_lock
    1334             :  *      task_struct::perf_event_mutex
    1335             :  *        perf_event_context::mutex
    1336             :  *          perf_event::child_mutex;
    1337             :  *            perf_event_context::lock
    1338             :  *          perf_event::mmap_mutex
    1339             :  *          mmap_lock
    1340             :  *            perf_addr_filters_head::lock
    1341             :  *
    1342             :  *    cpu_hotplug_lock
    1343             :  *      pmus_lock
    1344             :  *        cpuctx->mutex / perf_event_context::mutex
    1345             :  */
    1346             : static struct perf_event_context *
    1347           0 : perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
    1348             : {
    1349           0 :         struct perf_event_context *ctx;
    1350             : 
    1351             : again:
    1352           0 :         rcu_read_lock();
    1353           0 :         ctx = READ_ONCE(event->ctx);
    1354           0 :         if (!refcount_inc_not_zero(&ctx->refcount)) {
    1355           0 :                 rcu_read_unlock();
    1356           0 :                 goto again;
    1357             :         }
    1358           0 :         rcu_read_unlock();
    1359             : 
    1360           0 :         mutex_lock_nested(&ctx->mutex, nesting);
    1361           0 :         if (event->ctx != ctx) {
    1362           0 :                 mutex_unlock(&ctx->mutex);
    1363           0 :                 put_ctx(ctx);
    1364           0 :                 goto again;
    1365             :         }
    1366             : 
    1367           0 :         return ctx;
    1368             : }
    1369             : 
    1370             : static inline struct perf_event_context *
    1371           0 : perf_event_ctx_lock(struct perf_event *event)
    1372             : {
    1373           0 :         return perf_event_ctx_lock_nested(event, 0);
    1374             : }
    1375             : 
    1376           0 : static void perf_event_ctx_unlock(struct perf_event *event,
    1377             :                                   struct perf_event_context *ctx)
    1378             : {
    1379           0 :         mutex_unlock(&ctx->mutex);
    1380           0 :         put_ctx(ctx);
    1381           0 : }
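
A rough usage sketch of the two helpers above (hypothetical accessor, not from this file): external paths such as perf_fops or prctl bracket their work like this so that event->ctx cannot be switched underneath them:

        static u64 read_event_id_example(struct perf_event *event)
        {
                struct perf_event_context *ctx;
                u64 id;

                ctx = perf_event_ctx_lock(event);  /* ref + ctx->mutex, revalidates event->ctx */
                id = event->id;                    /* event/ctx relation is stable here        */
                perf_event_ctx_unlock(event, ctx); /* drops the mutex and the reference        */

                return id;
        }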
    1382             : 
    1383             : /*
     1384             :  * This must be done under the ctx->lock, so as to serialize against
     1385             :  * context_equiv(); therefore we cannot call put_ctx() since that might end up
     1386             :  * taking scheduler related locks and ctx->lock nests inside those.
    1387             :  */
    1388             : static __must_check struct perf_event_context *
    1389           0 : unclone_ctx(struct perf_event_context *ctx)
    1390             : {
    1391           0 :         struct perf_event_context *parent_ctx = ctx->parent_ctx;
    1392             : 
    1393           0 :         lockdep_assert_held(&ctx->lock);
    1394             : 
    1395           0 :         if (parent_ctx)
    1396           0 :                 ctx->parent_ctx = NULL;
    1397           0 :         ctx->generation++;
    1398             : 
    1399           0 :         return parent_ctx;
    1400             : }
    1401             : 
    1402           0 : static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
    1403             :                                 enum pid_type type)
    1404             : {
    1405           0 :         u32 nr;
    1406             :         /*
    1407             :          * only top level events have the pid namespace they were created in
    1408             :          */
    1409           0 :         if (event->parent)
    1410           0 :                 event = event->parent;
    1411             : 
    1412           0 :         nr = __task_pid_nr_ns(p, type, event->ns);
    1413             :         /* avoid -1 if it is idle thread or runs in another ns */
    1414           0 :         if (!nr && !pid_alive(p))
    1415           0 :                 nr = -1;
    1416           0 :         return nr;
    1417             : }
    1418             : 
    1419           0 : static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
    1420             : {
    1421           0 :         return perf_event_pid_type(event, p, PIDTYPE_TGID);
    1422             : }
    1423             : 
    1424           0 : static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
    1425             : {
    1426           0 :         return perf_event_pid_type(event, p, PIDTYPE_PID);
    1427             : }
    1428             : 
    1429             : /*
    1430             :  * If we inherit events we want to return the parent event id
    1431             :  * to userspace.
    1432             :  */
    1433           0 : static u64 primary_event_id(struct perf_event *event)
    1434             : {
    1435           0 :         u64 id = event->id;
    1436             : 
    1437           0 :         if (event->parent)
    1438           0 :                 id = event->parent->id;
    1439             : 
    1440           0 :         return id;
    1441             : }
    1442             : 
    1443             : /*
    1444             :  * Get the perf_event_context for a task and lock it.
    1445             :  *
    1446             :  * This has to cope with the fact that until it is locked,
    1447             :  * the context could get moved to another task.
    1448             :  */
    1449             : static struct perf_event_context *
    1450        2310 : perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
    1451             : {
    1452        2310 :         struct perf_event_context *ctx;
    1453             : 
    1454        2310 : retry:
    1455             :         /*
    1456             :          * One of the few rules of preemptible RCU is that one cannot do
    1457             :          * rcu_read_unlock() while holding a scheduler (or nested) lock when
    1458             :          * part of the read side critical section was irqs-enabled -- see
    1459             :          * rcu_read_unlock_special().
    1460             :          *
    1461             :          * Since ctx->lock nests under rq->lock we must ensure the entire read
    1462             :          * side critical section has interrupts disabled.
    1463             :          */
    1464        4620 :         local_irq_save(*flags);
    1465        2310 :         rcu_read_lock();
    1466        2310 :         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
    1467        2310 :         if (ctx) {
    1468             :                 /*
    1469             :                  * If this context is a clone of another, it might
    1470             :                  * get swapped for another underneath us by
    1471             :                  * perf_event_task_sched_out, though the
    1472             :                  * rcu_read_lock() protects us from any context
    1473             :                  * getting freed.  Lock the context and check if it
    1474             :                  * got swapped before we could get the lock, and retry
    1475             :                  * if so.  If we locked the right context, then it
    1476             :                  * can't get swapped on us any more.
    1477             :                  */
    1478           0 :                 raw_spin_lock(&ctx->lock);
    1479           0 :                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
    1480           0 :                         raw_spin_unlock(&ctx->lock);
    1481           0 :                         rcu_read_unlock();
    1482           0 :                         local_irq_restore(*flags);
    1483           0 :                         goto retry;
    1484             :                 }
    1485             : 
    1486           0 :                 if (ctx->task == TASK_TOMBSTONE ||
    1487           0 :                     !refcount_inc_not_zero(&ctx->refcount)) {
    1488           0 :                         raw_spin_unlock(&ctx->lock);
    1489           0 :                         ctx = NULL;
    1490             :                 } else {
    1491           0 :                         WARN_ON_ONCE(ctx->task != task);
    1492             :                 }
    1493             :         }
    1494        2310 :         rcu_read_unlock();
    1495        2310 :         if (!ctx)
    1496        2310 :                 local_irq_restore(*flags);
    1497        2310 :         return ctx;
    1498             : }
    1499             : 
    1500             : /*
    1501             :  * Get the context for a task and increment its pin_count so it
    1502             :  * can't get swapped to another task.  This also increments its
    1503             :  * reference count so that the context can't get freed.
    1504             :  */
    1505             : static struct perf_event_context *
    1506        2310 : perf_pin_task_context(struct task_struct *task, int ctxn)
    1507             : {
    1508        2310 :         struct perf_event_context *ctx;
    1509        2310 :         unsigned long flags;
    1510             : 
    1511        2310 :         ctx = perf_lock_task_context(task, ctxn, &flags);
    1512        2310 :         if (ctx) {
    1513           0 :                 ++ctx->pin_count;
    1514           0 :                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
    1515             :         }
    1516        2310 :         return ctx;
    1517             : }
    1518             : 
    1519           0 : static void perf_unpin_context(struct perf_event_context *ctx)
    1520             : {
    1521           0 :         unsigned long flags;
    1522             : 
    1523           0 :         raw_spin_lock_irqsave(&ctx->lock, flags);
    1524           0 :         --ctx->pin_count;
    1525           0 :         raw_spin_unlock_irqrestore(&ctx->lock, flags);
    1526           0 : }
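
Paired usage of the pin helpers (sketch; hypothetical caller and ctxn value): a caller that must keep a task's context from being swapped or freed while working on it does roughly the following, dropping both the pin and the reference taken by perf_pin_task_context() when done:

        static void with_pinned_ctx_example(struct task_struct *task, int ctxn)
        {
                struct perf_event_context *ctx;

                ctx = perf_pin_task_context(task, ctxn);  /* NULL if the task has no such ctx */
                if (!ctx)
                        return;

                /* ctx->pin_count and the refcount keep the context in place here */

                perf_unpin_context(ctx);  /* drop the pin ...                   */
                put_ctx(ctx);             /* ... and the reference from pinning */
        }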
    1527             : 
    1528             : /*
    1529             :  * Update the record of the current time in a context.
    1530             :  */
    1531           0 : static void update_context_time(struct perf_event_context *ctx)
    1532             : {
    1533           0 :         u64 now = perf_clock();
    1534             : 
    1535           0 :         ctx->time += now - ctx->timestamp;
    1536           0 :         ctx->timestamp = now;
    1537           0 : }
    1538             : 
    1539           0 : static u64 perf_event_time(struct perf_event *event)
    1540             : {
    1541           0 :         struct perf_event_context *ctx = event->ctx;
    1542             : 
    1543           0 :         if (is_cgroup_event(event))
    1544             :                 return perf_cgroup_event_time(event);
    1545             : 
    1546           0 :         return ctx ? ctx->time : 0;
    1547             : }
    1548             : 
    1549           0 : static enum event_type_t get_event_type(struct perf_event *event)
    1550             : {
    1551           0 :         struct perf_event_context *ctx = event->ctx;
    1552           0 :         enum event_type_t event_type;
    1553             : 
    1554           0 :         lockdep_assert_held(&ctx->lock);
    1555             : 
    1556             :         /*
    1557             :          * It's 'group type', really, because if our group leader is
    1558             :          * pinned, so are we.
    1559             :          */
    1560           0 :         if (event->group_leader != event)
    1561           0 :                 event = event->group_leader;
    1562             : 
    1563           0 :         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
    1564           0 :         if (!ctx->task)
    1565           0 :                 event_type |= EVENT_CPU;
    1566             : 
    1567           0 :         return event_type;
    1568             : }
    1569             : 
    1570             : /*
    1571             :  * Helper function to initialize event group nodes.
    1572             :  */
    1573           0 : static void init_event_group(struct perf_event *event)
    1574             : {
    1575           0 :         RB_CLEAR_NODE(&event->group_node);
    1576           0 :         event->group_index = 0;
    1577             : }
    1578             : 
    1579             : /*
    1580             :  * Extract pinned or flexible groups from the context
    1581             :  * based on event attrs bits.
    1582             :  */
    1583             : static struct perf_event_groups *
    1584           0 : get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
    1585             : {
    1586           0 :         if (event->attr.pinned)
    1587           0 :                 return &ctx->pinned_groups;
    1588             :         else
    1589           0 :                 return &ctx->flexible_groups;
    1590             : }
    1591             : 
    1592             : /*
     1593             :  * Helper function to initialize perf_event_groups trees.
    1594             :  */
    1595           8 : static void perf_event_groups_init(struct perf_event_groups *groups)
    1596             : {
    1597           8 :         groups->tree = RB_ROOT;
    1598           8 :         groups->index = 0;
    1599             : }
    1600             : 
    1601           0 : static inline struct cgroup *event_cgroup(const struct perf_event *event)
    1602             : {
    1603           0 :         struct cgroup *cgroup = NULL;
    1604             : 
    1605             : #ifdef CONFIG_CGROUP_PERF
    1606             :         if (event->cgrp)
    1607             :                 cgroup = event->cgrp->css.cgroup;
    1608             : #endif
    1609             : 
    1610           0 :         return cgroup;
    1611             : }
    1612             : 
    1613             : /*
    1614             :  * Compare function for event groups;
    1615             :  *
     1616             :  * Implements a complex key that sorts by CPU, then cgroup id, then a virtual
     1617             :  * index which provides ordering when rotating groups for the same CPU.
    1618             :  */
    1619             : static __always_inline int
    1620           0 : perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup,
    1621             :                       const u64 left_group_index, const struct perf_event *right)
    1622             : {
    1623           0 :         if (left_cpu < right->cpu)
    1624             :                 return -1;
    1625           0 :         if (left_cpu > right->cpu)
    1626             :                 return 1;
    1627             : 
    1628             : #ifdef CONFIG_CGROUP_PERF
    1629             :         {
    1630             :                 const struct cgroup *right_cgroup = event_cgroup(right);
    1631             : 
    1632             :                 if (left_cgroup != right_cgroup) {
    1633             :                         if (!left_cgroup) {
    1634             :                                 /*
     1635             :                                  * Left has no cgroup but right does;
     1636             :                                  * events without a cgroup come first.
    1637             :                                  */
    1638             :                                 return -1;
    1639             :                         }
    1640             :                         if (!right_cgroup) {
    1641             :                                 /*
     1642             :                                  * Right has no cgroup but left does;
     1643             :                                  * events without a cgroup come first.
    1644             :                                  */
    1645             :                                 return 1;
    1646             :                         }
    1647             :                         /* Two dissimilar cgroups, order by id. */
    1648             :                         if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup))
    1649             :                                 return -1;
    1650             : 
    1651             :                         return 1;
    1652             :                 }
    1653             :         }
    1654             : #endif
    1655             : 
    1656           0 :         if (left_group_index < right->group_index)
    1657             :                 return -1;
    1658           0 :         if (left_group_index > right->group_index)
    1659           0 :                 return 1;
    1660             : 
    1661             :         return 0;
    1662             : }
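
To make the resulting order concrete, a small illustration as (cpu, group_index) pairs, ignoring cgroups (they only participate under CONFIG_CGROUP_PERF):

        /*
         *   (0, 1) < (0, 7) < (1, 2) < (1, 3) < (4, 5)
         *
         * All events for one CPU form a contiguous subtree, and within that
         * subtree a larger group_index means the group was inserted, or rotated
         * to the back, more recently.
         */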
    1663             : 
    1664             : #define __node_2_pe(node) \
    1665             :         rb_entry((node), struct perf_event, group_node)
    1666             : 
    1667           0 : static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
    1668             : {
    1669           0 :         struct perf_event *e = __node_2_pe(a);
    1670           0 :         return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index,
    1671           0 :                                      __node_2_pe(b)) < 0;
    1672             : }
    1673             : 
    1674             : struct __group_key {
    1675             :         int cpu;
    1676             :         struct cgroup *cgroup;
    1677             : };
    1678             : 
    1679           0 : static inline int __group_cmp(const void *key, const struct rb_node *node)
    1680             : {
    1681           0 :         const struct __group_key *a = key;
    1682           0 :         const struct perf_event *b = __node_2_pe(node);
    1683             : 
    1684             :         /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */
    1685           0 :         return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b);
    1686             : }
    1687             : 
    1688             : /*
     1689             :  * Insert @event into @groups' tree, using {@event->cpu, ++@groups->index} for
     1690             :  * the key (see __group_less()). This places it last inside the CPU
    1691             :  * subtree.
    1692             :  */
    1693             : static void
    1694           0 : perf_event_groups_insert(struct perf_event_groups *groups,
    1695             :                          struct perf_event *event)
    1696             : {
    1697           0 :         event->group_index = ++groups->index;
    1698             : 
    1699           0 :         rb_add(&event->group_node, &groups->tree, __group_less);
    1700           0 : }
    1701             : 
    1702             : /*
    1703             :  * Helper function to insert event into the pinned or flexible groups.
    1704             :  */
    1705             : static void
    1706           0 : add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
    1707             : {
    1708           0 :         struct perf_event_groups *groups;
    1709             : 
    1710           0 :         groups = get_event_groups(event, ctx);
    1711           0 :         perf_event_groups_insert(groups, event);
    1712           0 : }
    1713             : 
    1714             : /*
    1715             :  * Delete a group from a tree.
    1716             :  */
    1717             : static void
    1718           0 : perf_event_groups_delete(struct perf_event_groups *groups,
    1719             :                          struct perf_event *event)
    1720             : {
    1721           0 :         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
    1722             :                      RB_EMPTY_ROOT(&groups->tree));
    1723             : 
    1724           0 :         rb_erase(&event->group_node, &groups->tree);
    1725           0 :         init_event_group(event);
    1726           0 : }
    1727             : 
    1728             : /*
    1729             :  * Helper function to delete event from its groups.
    1730             :  */
    1731             : static void
    1732           0 : del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
    1733             : {
    1734           0 :         struct perf_event_groups *groups;
    1735             : 
    1736           0 :         groups = get_event_groups(event, ctx);
    1737           0 :         perf_event_groups_delete(groups, event);
    1738           0 : }
    1739             : 
    1740             : /*
    1741             :  * Get the leftmost event in the cpu/cgroup subtree.
    1742             :  */
    1743             : static struct perf_event *
    1744           0 : perf_event_groups_first(struct perf_event_groups *groups, int cpu,
    1745             :                         struct cgroup *cgrp)
    1746             : {
    1747           0 :         struct __group_key key = {
    1748             :                 .cpu = cpu,
    1749             :                 .cgroup = cgrp,
    1750             :         };
    1751           0 :         struct rb_node *node;
    1752             : 
    1753           0 :         node = rb_find_first(&key, &groups->tree, __group_cmp);
    1754           0 :         if (node)
    1755           0 :                 return __node_2_pe(node);
    1756             : 
    1757             :         return NULL;
    1758             : }
    1759             : 
    1760             : /*
    1761             :  * Like rb_entry_next_safe() for the @cpu subtree.
    1762             :  */
    1763             : static struct perf_event *
    1764           0 : perf_event_groups_next(struct perf_event *event)
    1765             : {
    1766           0 :         struct __group_key key = {
    1767           0 :                 .cpu = event->cpu,
    1768           0 :                 .cgroup = event_cgroup(event),
    1769             :         };
    1770           0 :         struct rb_node *next;
    1771             : 
    1772           0 :         next = rb_next_match(&key, &event->group_node, __group_cmp);
    1773           0 :         if (next)
    1774           0 :                 return __node_2_pe(next);
    1775             : 
    1776             :         return NULL;
    1777             : }
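
A short sketch (hypothetical helper) of how the two lookups above combine to walk a single {cpu, cgroup} subtree, here with no cgroup:

        static int count_cpu_subtree_example(struct perf_event_groups *groups, int cpu)
        {
                struct perf_event *event;
                int nr = 0;

                for (event = perf_event_groups_first(groups, cpu, NULL);
                     event;
                     event = perf_event_groups_next(event))
                        nr++;   /* iteration stays within the matching subtree */

                return nr;
        }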
    1778             : 
    1779             : /*
    1780             :  * Iterate through the whole groups tree.
    1781             :  */
    1782             : #define perf_event_groups_for_each(event, groups)                       \
    1783             :         for (event = rb_entry_safe(rb_first(&((groups)->tree)),          \
    1784             :                                 typeof(*event), group_node); event;     \
    1785             :                 event = rb_entry_safe(rb_next(&event->group_node),       \
    1786             :                                 typeof(*event), group_node))
    1787             : 
    1788             : /*
     1789             :  * Add an event to the lists for its context.
    1790             :  * Must be called with ctx->mutex and ctx->lock held.
    1791             :  */
    1792             : static void
    1793           0 : list_add_event(struct perf_event *event, struct perf_event_context *ctx)
    1794             : {
    1795           0 :         lockdep_assert_held(&ctx->lock);
    1796             : 
    1797           0 :         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
    1798           0 :         event->attach_state |= PERF_ATTACH_CONTEXT;
    1799             : 
    1800           0 :         event->tstamp = perf_event_time(event);
    1801             : 
    1802             :         /*
    1803             :          * If we're a stand alone event or group leader, we go to the context
    1804             :          * list, group events are kept attached to the group so that
    1805             :          * perf_group_detach can, at all times, locate all siblings.
    1806             :          */
    1807           0 :         if (event->group_leader == event) {
    1808           0 :                 event->group_caps = event->event_caps;
    1809           0 :                 add_event_to_groups(event, ctx);
    1810             :         }
    1811             : 
    1812           0 :         list_add_rcu(&event->event_entry, &ctx->event_list);
    1813           0 :         ctx->nr_events++;
    1814           0 :         if (event->attr.inherit_stat)
    1815           0 :                 ctx->nr_stat++;
    1816             : 
    1817           0 :         if (event->state > PERF_EVENT_STATE_OFF)
    1818           0 :                 perf_cgroup_event_enable(event, ctx);
    1819             : 
    1820           0 :         ctx->generation++;
    1821           0 : }
    1822             : 
    1823             : /*
    1824             :  * Initialize event state based on the perf_event_attr::disabled.
    1825             :  */
    1826           0 : static inline void perf_event__state_init(struct perf_event *event)
    1827             : {
    1828           0 :         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
    1829             :                                               PERF_EVENT_STATE_INACTIVE;
    1830             : }
    1831             : 
    1832           0 : static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
    1833             : {
    1834           0 :         int entry = sizeof(u64); /* value */
    1835           0 :         int size = 0;
    1836           0 :         int nr = 1;
    1837             : 
    1838           0 :         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
    1839           0 :                 size += sizeof(u64);
    1840             : 
    1841           0 :         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
    1842           0 :                 size += sizeof(u64);
    1843             : 
    1844           0 :         if (event->attr.read_format & PERF_FORMAT_ID)
    1845           0 :                 entry += sizeof(u64);
    1846             : 
    1847           0 :         if (event->attr.read_format & PERF_FORMAT_GROUP) {
    1848           0 :                 nr += nr_siblings;
    1849           0 :                 size += sizeof(u64);
    1850             :         }
    1851             : 
    1852           0 :         size += entry * nr;
    1853           0 :         event->read_size = size;
    1854           0 : }
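
A worked example of the computation above, for read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID and two siblings (values chosen for illustration):

        /*
         *   entry = 8 (value) + 8 (PERF_FORMAT_ID)         = 16 bytes
         *   nr    = 1 (leader) + 2 (siblings)              = 3
         *   size  = 8 (the group's leading "nr" field)     = 8 bytes
         *   read_size = size + entry * nr = 8 + 16 * 3     = 56 bytes
         */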
    1855             : 
    1856           0 : static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
    1857             : {
    1858           0 :         struct perf_sample_data *data;
    1859           0 :         u16 size = 0;
    1860             : 
    1861           0 :         if (sample_type & PERF_SAMPLE_IP)
    1862           0 :                 size += sizeof(data->ip);
    1863             : 
    1864           0 :         if (sample_type & PERF_SAMPLE_ADDR)
    1865           0 :                 size += sizeof(data->addr);
    1866             : 
    1867           0 :         if (sample_type & PERF_SAMPLE_PERIOD)
    1868           0 :                 size += sizeof(data->period);
    1869             : 
    1870           0 :         if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
    1871           0 :                 size += sizeof(data->weight.full);
    1872             : 
    1873           0 :         if (sample_type & PERF_SAMPLE_READ)
    1874           0 :                 size += event->read_size;
    1875             : 
    1876           0 :         if (sample_type & PERF_SAMPLE_DATA_SRC)
    1877           0 :                 size += sizeof(data->data_src.val);
    1878             : 
    1879           0 :         if (sample_type & PERF_SAMPLE_TRANSACTION)
    1880           0 :                 size += sizeof(data->txn);
    1881             : 
    1882           0 :         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
    1883           0 :                 size += sizeof(data->phys_addr);
    1884             : 
    1885           0 :         if (sample_type & PERF_SAMPLE_CGROUP)
    1886           0 :                 size += sizeof(data->cgroup);
    1887             : 
    1888           0 :         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
    1889           0 :                 size += sizeof(data->data_page_size);
    1890             : 
    1891           0 :         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
    1892           0 :                 size += sizeof(data->code_page_size);
    1893             : 
    1894           0 :         event->header_size = size;
    1895           0 : }
    1896             : 
    1897             : /*
    1898             :  * Called at perf_event creation and when events are attached/detached from a
    1899             :  * group.
    1900             :  */
    1901           0 : static void perf_event__header_size(struct perf_event *event)
    1902             : {
    1903           0 :         __perf_event_read_size(event,
    1904           0 :                                event->group_leader->nr_siblings);
    1905           0 :         __perf_event_header_size(event, event->attr.sample_type);
    1906           0 : }
    1907             : 
    1908           0 : static void perf_event__id_header_size(struct perf_event *event)
    1909             : {
    1910           0 :         struct perf_sample_data *data;
    1911           0 :         u64 sample_type = event->attr.sample_type;
    1912           0 :         u16 size = 0;
    1913             : 
    1914           0 :         if (sample_type & PERF_SAMPLE_TID)
    1915           0 :                 size += sizeof(data->tid_entry);
    1916             : 
    1917           0 :         if (sample_type & PERF_SAMPLE_TIME)
    1918           0 :                 size += sizeof(data->time);
    1919             : 
    1920           0 :         if (sample_type & PERF_SAMPLE_IDENTIFIER)
    1921           0 :                 size += sizeof(data->id);
    1922             : 
    1923           0 :         if (sample_type & PERF_SAMPLE_ID)
    1924           0 :                 size += sizeof(data->id);
    1925             : 
    1926           0 :         if (sample_type & PERF_SAMPLE_STREAM_ID)
    1927           0 :                 size += sizeof(data->stream_id);
    1928             : 
    1929           0 :         if (sample_type & PERF_SAMPLE_CPU)
    1930           0 :                 size += sizeof(data->cpu_entry);
    1931             : 
    1932           0 :         event->id_header_size = size;
    1933           0 : }
    1934             : 
    1935           0 : static bool perf_event_validate_size(struct perf_event *event)
    1936             : {
    1937             :         /*
    1938             :          * The values computed here will be over-written when we actually
    1939             :          * attach the event.
    1940             :          */
    1941           0 :         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
    1942           0 :         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
    1943           0 :         perf_event__id_header_size(event);
    1944             : 
    1945             :         /*
    1946             :          * Sum the lot; should not exceed the 64k limit we have on records.
    1947             :          * Conservative limit to allow for callchains and other variable fields.
    1948             :          */
    1949           0 :         if (event->read_size + event->header_size +
    1950           0 :             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
    1951           0 :                 return false;
    1952             : 
    1953             :         return true;
    1954             : }
    1955             : 
    1956           0 : static void perf_group_attach(struct perf_event *event)
    1957             : {
    1958           0 :         struct perf_event *group_leader = event->group_leader, *pos;
    1959             : 
    1960           0 :         lockdep_assert_held(&event->ctx->lock);
    1961             : 
    1962             :         /*
    1963             :          * We can have double attach due to group movement in perf_event_open.
    1964             :          */
    1965           0 :         if (event->attach_state & PERF_ATTACH_GROUP)
    1966             :                 return;
    1967             : 
    1968           0 :         event->attach_state |= PERF_ATTACH_GROUP;
    1969             : 
    1970           0 :         if (group_leader == event)
    1971             :                 return;
    1972             : 
    1973           0 :         WARN_ON_ONCE(group_leader->ctx != event->ctx);
    1974             : 
    1975           0 :         group_leader->group_caps &= event->event_caps;
    1976             : 
    1977           0 :         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
    1978           0 :         group_leader->nr_siblings++;
    1979             : 
    1980           0 :         perf_event__header_size(group_leader);
    1981             : 
    1982           0 :         for_each_sibling_event(pos, group_leader)
    1983           0 :                 perf_event__header_size(pos);
    1984             : }
    1985             : 
    1986             : /*
    1987             :  * Remove an event from the lists for its context.
    1988             :  * Must be called with ctx->mutex and ctx->lock held.
    1989             :  */
    1990             : static void
    1991           0 : list_del_event(struct perf_event *event, struct perf_event_context *ctx)
    1992             : {
    1993           0 :         WARN_ON_ONCE(event->ctx != ctx);
    1994           0 :         lockdep_assert_held(&ctx->lock);
    1995             : 
    1996             :         /*
    1997             :          * We can have double detach due to exit/hot-unplug + close.
    1998             :          */
    1999           0 :         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
    2000             :                 return;
    2001             : 
    2002           0 :         event->attach_state &= ~PERF_ATTACH_CONTEXT;
    2003             : 
    2004           0 :         ctx->nr_events--;
    2005           0 :         if (event->attr.inherit_stat)
    2006           0 :                 ctx->nr_stat--;
    2007             : 
    2008           0 :         list_del_rcu(&event->event_entry);
    2009             : 
    2010           0 :         if (event->group_leader == event)
    2011           0 :                 del_event_from_groups(event, ctx);
    2012             : 
    2013             :         /*
    2014             :          * If event was in error state, then keep it
    2015             :          * that way, otherwise bogus counts will be
    2016             :          * returned on read(). The only way to get out
    2017             :          * of error state is by explicit re-enabling
    2018             :          * of the event
    2019             :          */
    2020           0 :         if (event->state > PERF_EVENT_STATE_OFF) {
    2021           0 :                 perf_cgroup_event_disable(event, ctx);
    2022           0 :                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
    2023             :         }
    2024             : 
    2025           0 :         ctx->generation++;
    2026             : }
    2027             : 
    2028             : static int
    2029           0 : perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
    2030             : {
    2031           0 :         if (!has_aux(aux_event))
    2032             :                 return 0;
    2033             : 
    2034           0 :         if (!event->pmu->aux_output_match)
    2035             :                 return 0;
    2036             : 
    2037           0 :         return event->pmu->aux_output_match(aux_event);
    2038             : }
    2039             : 
    2040             : static void put_event(struct perf_event *event);
    2041             : static void event_sched_out(struct perf_event *event,
    2042             :                             struct perf_cpu_context *cpuctx,
    2043             :                             struct perf_event_context *ctx);
    2044             : 
    2045           0 : static void perf_put_aux_event(struct perf_event *event)
    2046             : {
    2047           0 :         struct perf_event_context *ctx = event->ctx;
    2048           0 :         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
    2049           0 :         struct perf_event *iter;
    2050             : 
    2051             :         /*
     2052             :          * If the event uses an aux_event, tear down the link
    2053             :          */
    2054           0 :         if (event->aux_event) {
    2055           0 :                 iter = event->aux_event;
    2056           0 :                 event->aux_event = NULL;
    2057           0 :                 put_event(iter);
    2058           0 :                 return;
    2059             :         }
    2060             : 
    2061             :         /*
    2062             :          * If the event is an aux_event, tear down all links to
    2063             :          * it from other events.
    2064             :          */
    2065           0 :         for_each_sibling_event(iter, event->group_leader) {
    2066           0 :                 if (iter->aux_event != event)
    2067           0 :                         continue;
    2068             : 
    2069           0 :                 iter->aux_event = NULL;
    2070           0 :                 put_event(event);
    2071             : 
    2072             :                 /*
    2073             :                  * If it's ACTIVE, schedule it out and put it into ERROR
    2074             :                  * state so that we don't try to schedule it again. Note
    2075             :                  * that perf_event_enable() will clear the ERROR status.
    2076             :                  */
    2077           0 :                 event_sched_out(iter, cpuctx, ctx);
    2078           0 :                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
    2079             :         }
    2080             : }
    2081             : 
    2082           0 : static bool perf_need_aux_event(struct perf_event *event)
    2083             : {
    2084           0 :         return !!event->attr.aux_output || !!event->attr.aux_sample_size;
    2085             : }
    2086             : 
    2087           0 : static int perf_get_aux_event(struct perf_event *event,
    2088             :                               struct perf_event *group_leader)
    2089             : {
    2090             :         /*
    2091             :          * Our group leader must be an aux event if we want to be
    2092             :          * an aux_output. This way, the aux event will precede its
    2093             :          * aux_output events in the group, and therefore will always
    2094             :          * schedule first.
    2095             :          */
    2096           0 :         if (!group_leader)
    2097             :                 return 0;
    2098             : 
    2099             :         /*
    2100             :          * aux_output and aux_sample_size are mutually exclusive.
    2101             :          */
    2102           0 :         if (event->attr.aux_output && event->attr.aux_sample_size)
    2103             :                 return 0;
    2104             : 
    2105           0 :         if (event->attr.aux_output &&
    2106           0 :             !perf_aux_output_match(event, group_leader))
    2107             :                 return 0;
    2108             : 
    2109           0 :         if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
    2110             :                 return 0;
    2111             : 
    2112           0 :         if (!atomic_long_inc_not_zero(&group_leader->refcount))
    2113             :                 return 0;
    2114             : 
    2115             :         /*
    2116             :          * Link aux_outputs to their aux event; this is undone in
    2117             :          * perf_group_detach() by perf_put_aux_event(). When the
     2118             :          * group is torn down, the aux_output events lose their
    2119             :          * link to the aux_event and can't schedule any more.
    2120             :          */
    2121           0 :         event->aux_event = group_leader;
    2122             : 
    2123           0 :         return 1;
    2124             : }
    2125             : 
    2126           0 : static inline struct list_head *get_event_list(struct perf_event *event)
    2127             : {
    2128           0 :         struct perf_event_context *ctx = event->ctx;
    2129           0 :         return event->attr.pinned ? &ctx->pinned_active : &ctx->flexible_active;
    2130             : }
    2131             : 
    2132             : /*
    2133             :  * Events that have PERF_EV_CAP_SIBLING require being part of a group and
    2134             :  * cannot exist on their own; schedule them out and move them into the ERROR
    2135             :  * state. Also see _perf_event_enable(), it will not be able to recover
    2136             :  * this ERROR state.
    2137             :  */
    2138           0 : static inline void perf_remove_sibling_event(struct perf_event *event)
    2139             : {
    2140           0 :         struct perf_event_context *ctx = event->ctx;
    2141           0 :         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
    2142             : 
    2143           0 :         event_sched_out(event, cpuctx, ctx);
    2144           0 :         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
    2145           0 : }
    2146             : 
    2147           0 : static void perf_group_detach(struct perf_event *event)
    2148             : {
    2149           0 :         struct perf_event *leader = event->group_leader;
    2150           0 :         struct perf_event *sibling, *tmp;
    2151           0 :         struct perf_event_context *ctx = event->ctx;
    2152             : 
    2153           0 :         lockdep_assert_held(&ctx->lock);
    2154             : 
    2155             :         /*
    2156             :          * We can have double detach due to exit/hot-unplug + close.
    2157             :          */
    2158           0 :         if (!(event->attach_state & PERF_ATTACH_GROUP))
    2159             :                 return;
    2160             : 
    2161           0 :         event->attach_state &= ~PERF_ATTACH_GROUP;
    2162             : 
    2163           0 :         perf_put_aux_event(event);
    2164             : 
    2165             :         /*
    2166             :          * If this is a sibling, remove it from its group.
    2167             :          */
    2168           0 :         if (leader != event) {
    2169           0 :                 list_del_init(&event->sibling_list);
    2170           0 :                 event->group_leader->nr_siblings--;
    2171           0 :                 goto out;
    2172             :         }
    2173             : 
    2174             :         /*
    2175             :          * If this was a group event with sibling events then
    2176             :          * upgrade the siblings to singleton events by adding them
    2177             :          * to whatever list we are on.
    2178             :          */
    2179           0 :         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
    2180             : 
    2181           0 :                 if (sibling->event_caps & PERF_EV_CAP_SIBLING)
    2182           0 :                         perf_remove_sibling_event(sibling);
    2183             : 
    2184           0 :                 sibling->group_leader = sibling;
    2185           0 :                 list_del_init(&sibling->sibling_list);
    2186             : 
    2187             :                 /* Inherit group flags from the previous leader */
    2188           0 :                 sibling->group_caps = event->group_caps;
    2189             : 
    2190           0 :                 if (!RB_EMPTY_NODE(&event->group_node)) {
    2191           0 :                         add_event_to_groups(sibling, event->ctx);
    2192             : 
    2193           0 :                         if (sibling->state == PERF_EVENT_STATE_ACTIVE)
    2194           0 :                                 list_add_tail(&sibling->active_list, get_event_list(sibling));
    2195             :                 }
    2196             : 
    2197           0 :                 WARN_ON_ONCE(sibling->ctx != event->ctx);
    2198             :         }
    2199             : 
    2200           0 : out:
    2201           0 :         for_each_sibling_event(tmp, leader)
    2202           0 :                 perf_event__header_size(tmp);
    2203             : 
    2204           0 :         perf_event__header_size(leader);
    2205             : }
    2206             : 
    2207           0 : static bool is_orphaned_event(struct perf_event *event)
    2208             : {
    2209           0 :         return event->state == PERF_EVENT_STATE_DEAD;
    2210             : }
    2211             : 
    2212           0 : static inline int __pmu_filter_match(struct perf_event *event)
    2213             : {
    2214           0 :         struct pmu *pmu = event->pmu;
    2215           0 :         return pmu->filter_match ? pmu->filter_match(event) : 1;
    2216             : }
    2217             : 
    2218             : /*
    2219             :  * Check whether we should attempt to schedule an event group based on
    2220             :  * PMU-specific filtering. An event group can consist of HW and SW events,
    2221             :  * potentially with a SW leader, so we must check all the filters, to
    2222             :  * determine whether a group is schedulable:
    2223             :  */
    2224           0 : static inline int pmu_filter_match(struct perf_event *event)
    2225             : {
    2226           0 :         struct perf_event *sibling;
    2227             : 
    2228           0 :         if (!__pmu_filter_match(event))
    2229             :                 return 0;
    2230             : 
    2231           0 :         for_each_sibling_event(sibling, event) {
    2232           0 :                 if (!__pmu_filter_match(sibling))
    2233             :                         return 0;
    2234             :         }
    2235             : 
    2236             :         return 1;
    2237             : }
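                     :
                     : /*
                     :  * Illustrative sketch (not part of this file): a heterogeneous-CPU PMU
                     :  * driver might implement the pmu::filter_match callback consulted above
                     :  * roughly as follows (names hypothetical):
                     :  *
                     :  *      static int my_pmu_filter_match(struct perf_event *event)
                     :  *      {
                     :  *              struct my_pmu *pmu = to_my_pmu(event->pmu);
                     :  *
                     :  *              return cpumask_test_cpu(smp_processor_id(),
                     :  *                                      &pmu->supported_cpus);
                     :  *      }
                     :  *
                     :  * Returning 0 makes event_filter_match() skip the whole group on CPUs the
                     :  * PMU cannot count on.
                     :  */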
    2238             : 
    2239             : static inline int
    2240           0 : event_filter_match(struct perf_event *event)
    2241             : {
    2242           0 :         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
    2243           0 :                perf_cgroup_match(event) && pmu_filter_match(event);
    2244             : }
    2245             : 
    2246             : static void
    2247           0 : event_sched_out(struct perf_event *event,
    2248             :                   struct perf_cpu_context *cpuctx,
    2249             :                   struct perf_event_context *ctx)
    2250             : {
    2251           0 :         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
    2252             : 
    2253           0 :         WARN_ON_ONCE(event->ctx != ctx);
    2254           0 :         lockdep_assert_held(&ctx->lock);
    2255             : 
    2256           0 :         if (event->state != PERF_EVENT_STATE_ACTIVE)
    2257             :                 return;
    2258             : 
    2259             :         /*
    2260             :          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
    2261             :          * we can schedule events _OUT_ individually through things like
    2262             :          * __perf_remove_from_context().
    2263             :          */
    2264           0 :         list_del_init(&event->active_list);
    2265             : 
    2266           0 :         perf_pmu_disable(event->pmu);
    2267             : 
    2268           0 :         event->pmu->del(event, 0);
    2269           0 :         event->oncpu = -1;
    2270             : 
    2271           0 :         if (READ_ONCE(event->pending_disable) >= 0) {
    2272           0 :                 WRITE_ONCE(event->pending_disable, -1);
    2273           0 :                 perf_cgroup_event_disable(event, ctx);
    2274           0 :                 state = PERF_EVENT_STATE_OFF;
    2275             :         }
    2276           0 :         perf_event_set_state(event, state);
    2277             : 
    2278           0 :         if (!is_software_event(event))
    2279           0 :                 cpuctx->active_oncpu--;
    2280           0 :         if (!--ctx->nr_active)
    2281           0 :                 perf_event_ctx_deactivate(ctx);
    2282           0 :         if (event->attr.freq && event->attr.sample_freq)
    2283           0 :                 ctx->nr_freq--;
    2284           0 :         if (event->attr.exclusive || !cpuctx->active_oncpu)
    2285           0 :                 cpuctx->exclusive = 0;
    2286             : 
    2287           0 :         perf_pmu_enable(event->pmu);
    2288             : }
    2289             : 
    2290             : static void
    2291           0 : group_sched_out(struct perf_event *group_event,
    2292             :                 struct perf_cpu_context *cpuctx,
    2293             :                 struct perf_event_context *ctx)
    2294             : {
    2295           0 :         struct perf_event *event;
    2296             : 
    2297           0 :         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
    2298             :                 return;
    2299             : 
    2300           0 :         perf_pmu_disable(ctx->pmu);
    2301             : 
    2302           0 :         event_sched_out(group_event, cpuctx, ctx);
    2303             : 
    2304             :         /*
    2305             :          * Schedule out siblings (if any):
    2306             :          */
    2307           0 :         for_each_sibling_event(event, group_event)
    2308           0 :                 event_sched_out(event, cpuctx, ctx);
    2309             : 
    2310           0 :         perf_pmu_enable(ctx->pmu);
    2311             : }
    2312             : 
    2313             : #define DETACH_GROUP    0x01UL
    2314             : 
    2315             : /*
    2316             :  * Cross CPU call to remove a performance event
    2317             :  *
    2318             :  * We disable the event on the hardware level first. After that we
    2319             :  * remove it from the context list.
    2320             :  */
    2321             : static void
    2322           0 : __perf_remove_from_context(struct perf_event *event,
    2323             :                            struct perf_cpu_context *cpuctx,
    2324             :                            struct perf_event_context *ctx,
    2325             :                            void *info)
    2326             : {
    2327           0 :         unsigned long flags = (unsigned long)info;
    2328             : 
    2329           0 :         if (ctx->is_active & EVENT_TIME) {
    2330           0 :                 update_context_time(ctx);
    2331           0 :                 update_cgrp_time_from_cpuctx(cpuctx);
    2332             :         }
    2333             : 
    2334           0 :         event_sched_out(event, cpuctx, ctx);
    2335           0 :         if (flags & DETACH_GROUP)
    2336           0 :                 perf_group_detach(event);
    2337           0 :         list_del_event(event, ctx);
    2338             : 
    2339           0 :         if (!ctx->nr_events && ctx->is_active) {
    2340           0 :                 ctx->is_active = 0;
    2341           0 :                 ctx->rotate_necessary = 0;
    2342           0 :                 if (ctx->task) {
    2343           0 :                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
    2344           0 :                         cpuctx->task_ctx = NULL;
    2345             :                 }
    2346             :         }
    2347           0 : }
    2348             : 
    2349             : /*
    2350             :  * Remove the event from a task's (or a CPU's) list of events.
    2351             :  *
    2352             :  * If event->ctx is a cloned context, callers must make sure that
    2353             :  * every task struct that event->ctx->task could possibly point to
    2354             :  * remains valid.  This is OK when called from perf_release since
    2355             :  * that only calls us on the top-level context, which can't be a clone.
    2356             :  * When called from perf_event_exit_task, it's OK because the
    2357             :  * context has been detached from its task.
    2358             :  */
    2359           0 : static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
    2360             : {
    2361           0 :         struct perf_event_context *ctx = event->ctx;
    2362             : 
    2363           0 :         lockdep_assert_held(&ctx->mutex);
    2364             : 
    2365           0 :         event_function_call(event, __perf_remove_from_context, (void *)flags);
    2366             : 
    2367             :         /*
    2368             :          * The above event_function_call() can NO-OP when it hits
    2369             :          * TASK_TOMBSTONE. In that case we must already have been detached
    2370             :          * from the context (by perf_event_exit_event()) but the grouping
    2371             :          * might still be intact.
    2372             :          */
    2373           0 :         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
    2374           0 :         if ((flags & DETACH_GROUP) &&
    2375           0 :             (event->attach_state & PERF_ATTACH_GROUP)) {
    2376             :                 /*
    2377             :                  * Since in that case we cannot possibly be scheduled, simply
    2378             :                  * detach now.
    2379             :                  */
    2380           0 :                 raw_spin_lock_irq(&ctx->lock);
    2381           0 :                 perf_group_detach(event);
    2382           0 :                 raw_spin_unlock_irq(&ctx->lock);
    2383             :         }
    2384           0 : }
    2385             : 
    2386             : /*
    2387             :  * Cross CPU call to disable a performance event
    2388             :  */
    2389           0 : static void __perf_event_disable(struct perf_event *event,
    2390             :                                  struct perf_cpu_context *cpuctx,
    2391             :                                  struct perf_event_context *ctx,
    2392             :                                  void *info)
    2393             : {
    2394           0 :         if (event->state < PERF_EVENT_STATE_INACTIVE)
    2395             :                 return;
    2396             : 
    2397           0 :         if (ctx->is_active & EVENT_TIME) {
    2398           0 :                 update_context_time(ctx);
    2399           0 :                 update_cgrp_time_from_event(event);
    2400             :         }
    2401             : 
    2402           0 :         if (event == event->group_leader)
    2403           0 :                 group_sched_out(event, cpuctx, ctx);
    2404             :         else
    2405           0 :                 event_sched_out(event, cpuctx, ctx);
    2406             : 
    2407           0 :         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
    2408           0 :         perf_cgroup_event_disable(event, ctx);
    2409             : }
    2410             : 
    2411             : /*
    2412             :  * Disable an event.
    2413             :  *
    2414             :  * If event->ctx is a cloned context, callers must make sure that
    2415             :  * every task struct that event->ctx->task could possibly point to
    2416             :  * remains valid.  This condition is satisfied when called through
    2417             :  * perf_event_for_each_child or perf_event_for_each because they
    2418             :  * hold the top-level event's child_mutex, so any descendant that
    2419             :  * goes to exit will block in perf_event_exit_event().
    2420             :  *
    2421             :  * When called from perf_pending_event it's OK because event->ctx
    2422             :  * is the current context on this CPU and preemption is disabled,
    2423             :  * hence we can't get into perf_event_task_sched_out for this context.
    2424             :  */
    2425           0 : static void _perf_event_disable(struct perf_event *event)
    2426             : {
    2427           0 :         struct perf_event_context *ctx = event->ctx;
    2428             : 
    2429           0 :         raw_spin_lock_irq(&ctx->lock);
    2430           0 :         if (event->state <= PERF_EVENT_STATE_OFF) {
    2431           0 :                 raw_spin_unlock_irq(&ctx->lock);
    2432           0 :                 return;
    2433             :         }
    2434           0 :         raw_spin_unlock_irq(&ctx->lock);
    2435             : 
    2436           0 :         event_function_call(event, __perf_event_disable, NULL);
    2437             : }
    2438             : 
    2439           0 : void perf_event_disable_local(struct perf_event *event)
    2440             : {
    2441           0 :         event_function_local(event, __perf_event_disable, NULL);
    2442           0 : }
    2443             : 
    2444             : /*
    2445             :  * Strictly speaking, kernel users cannot create groups and therefore this
    2446             :  * interface does not need the perf_event_ctx_lock() magic.
    2447             :  */
    2448           0 : void perf_event_disable(struct perf_event *event)
    2449             : {
    2450           0 :         struct perf_event_context *ctx;
    2451             : 
    2452           0 :         ctx = perf_event_ctx_lock(event);
    2453           0 :         _perf_event_disable(event);
    2454           0 :         perf_event_ctx_unlock(event, ctx);
    2455           0 : }
    2456             : EXPORT_SYMBOL_GPL(perf_event_disable);
    2457             : 
    2458           0 : void perf_event_disable_inatomic(struct perf_event *event)
    2459             : {
    2460           0 :         WRITE_ONCE(event->pending_disable, smp_processor_id());
    2461             :         /* can fail, see perf_pending_event_disable() */
    2462           0 :         irq_work_queue(&event->pending);
    2463           0 : }
    2464             : 
    2465           0 : static void perf_set_shadow_time(struct perf_event *event,
    2466             :                                  struct perf_event_context *ctx)
    2467             : {
    2468             :         /*
    2469             :          * use the correct time source for the time snapshot
    2470             :          *
    2471             :          * We could get by without this by leveraging the
    2472             :          * fact that to get to this function, the caller
    2473             :          * has most likely already called update_context_time()
    2474             :          * and update_cgrp_time_xx() and thus both timestamps
    2475             :          * are identical (or very close). Given that tstamp is
    2476             :          * already adjusted for cgroup, we could say that:
    2477             :          *    tstamp - ctx->timestamp
    2478             :          * is equivalent to
    2479             :          *    tstamp - cgrp->timestamp.
    2480             :          *
    2481             :          * Then, in perf_output_read(), the calculation would
    2482             :          * work with no changes because:
    2483             :          * - event is guaranteed scheduled in
    2484             :          * - it is not scheduled out in between
    2485             :          * - thus the timestamp would be the same
    2486             :          *
    2487             :          * But this is a bit hairy.
    2488             :          *
    2489             :          * So instead, we have an explicit cgroup call to remain
    2490             :          * within the time source all along. We believe it
    2491             :          * is cleaner and simpler to understand.
    2492             :          */
    2493           0 :         if (is_cgroup_event(event))
    2494           0 :                 perf_cgroup_set_shadow_time(event, event->tstamp);
    2495             :         else
    2496           0 :                 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
    2497             : }
    2498             : 
    2499             : #define MAX_INTERRUPTS (~0ULL)
    2500             : 
    2501             : static void perf_log_throttle(struct perf_event *event, int enable);
    2502             : static void perf_log_itrace_start(struct perf_event *event);
    2503             : 
    2504             : static int
    2505           0 : event_sched_in(struct perf_event *event,
    2506             :                  struct perf_cpu_context *cpuctx,
    2507             :                  struct perf_event_context *ctx)
    2508             : {
    2509           0 :         int ret = 0;
    2510             : 
    2511           0 :         WARN_ON_ONCE(event->ctx != ctx);
    2512             : 
    2513           0 :         lockdep_assert_held(&ctx->lock);
    2514             : 
    2515           0 :         if (event->state <= PERF_EVENT_STATE_OFF)
    2516             :                 return 0;
    2517             : 
    2518           0 :         WRITE_ONCE(event->oncpu, smp_processor_id());
    2519             :         /*
    2520             :          * Order event::oncpu write to happen before the ACTIVE state is
    2521             :          * visible. This allows perf_event_{stop,read}() to observe the correct
    2522             :          * ->oncpu if it sees ACTIVE.
    2523             :          */
    2524           0 :         smp_wmb();
    2525           0 :         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
    2526             : 
    2527             :         /*
    2528             :          * Unthrottle events: since we were just scheduled in we might have
    2529             :          * missed several ticks already, and for a heavily scheduling task
    2530             :          * there is little guarantee it'll get a tick in a timely manner.
    2531             :          */
    2532           0 :         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
    2533           0 :                 perf_log_throttle(event, 1);
    2534           0 :                 event->hw.interrupts = 0;
    2535             :         }
    2536             : 
    2537           0 :         perf_pmu_disable(event->pmu);
    2538             : 
    2539           0 :         perf_set_shadow_time(event, ctx);
    2540             : 
    2541           0 :         perf_log_itrace_start(event);
    2542             : 
    2543           0 :         if (event->pmu->add(event, PERF_EF_START)) {
    2544           0 :                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
    2545           0 :                 event->oncpu = -1;
    2546           0 :                 ret = -EAGAIN;
    2547           0 :                 goto out;
    2548             :         }
    2549             : 
    2550           0 :         if (!is_software_event(event))
    2551           0 :                 cpuctx->active_oncpu++;
    2552           0 :         if (!ctx->nr_active++)
    2553           0 :                 perf_event_ctx_activate(ctx);
    2554           0 :         if (event->attr.freq && event->attr.sample_freq)
    2555           0 :                 ctx->nr_freq++;
    2556             : 
    2557           0 :         if (event->attr.exclusive)
    2558           0 :                 cpuctx->exclusive = 1;
    2559             : 
    2560           0 : out:
    2561           0 :         perf_pmu_enable(event->pmu);
    2562             : 
    2563           0 :         return ret;
    2564             : }
    2565             : 
    2566             : static int
    2567           0 : group_sched_in(struct perf_event *group_event,
    2568             :                struct perf_cpu_context *cpuctx,
    2569             :                struct perf_event_context *ctx)
    2570             : {
    2571           0 :         struct perf_event *event, *partial_group = NULL;
    2572           0 :         struct pmu *pmu = ctx->pmu;
    2573             : 
    2574           0 :         if (group_event->state == PERF_EVENT_STATE_OFF)
    2575             :                 return 0;
    2576             : 
    2577           0 :         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
    2578             : 
    2579           0 :         if (event_sched_in(group_event, cpuctx, ctx))
    2580           0 :                 goto error;
    2581             : 
    2582             :         /*
    2583             :          * Schedule in siblings as one group (if any):
    2584             :          */
    2585           0 :         for_each_sibling_event(event, group_event) {
    2586           0 :                 if (event_sched_in(event, cpuctx, ctx)) {
    2587           0 :                         partial_group = event;
    2588           0 :                         goto group_error;
    2589             :                 }
    2590             :         }
    2591             : 
    2592           0 :         if (!pmu->commit_txn(pmu))
    2593             :                 return 0;
    2594             : 
    2595           0 : group_error:
    2596             :         /*
    2597             :          * Groups can be scheduled in as one unit only, so undo any
    2598             :          * partial group before returning:
    2599             :          * The events up to the failed event are scheduled out normally.
    2600             :          */
    2601           0 :         for_each_sibling_event(event, group_event) {
    2602           0 :                 if (event == partial_group)
    2603             :                         break;
    2604             : 
    2605           0 :                 event_sched_out(event, cpuctx, ctx);
    2606             :         }
    2607           0 :         event_sched_out(group_event, cpuctx, ctx);
    2608             : 
    2609           0 : error:
    2610           0 :         pmu->cancel_txn(pmu);
    2611           0 :         return -EAGAIN;
    2612             : }
    2613             : 
    2614             : /*
    2615             :  * Work out whether we can put this event group on the CPU now.
    2616             :  */
    2617           0 : static int group_can_go_on(struct perf_event *event,
    2618             :                            struct perf_cpu_context *cpuctx,
    2619             :                            int can_add_hw)
    2620             : {
    2621             :         /*
    2622             :          * Groups consisting entirely of software events can always go on.
    2623             :          */
    2624           0 :         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
    2625             :                 return 1;
    2626             :         /*
    2627             :          * If an exclusive group is already on, no other hardware
    2628             :          * events can go on.
    2629             :          */
    2630           0 :         if (cpuctx->exclusive)
    2631             :                 return 0;
    2632             :         /*
    2633             :          * If this group is exclusive and there are already
    2634             :          * events on the CPU, it can't go on.
    2635             :          */
    2636           0 :         if (event->attr.exclusive && !list_empty(get_event_list(event)))
    2637           0 :                 return 0;
    2638             :         /*
    2639             :          * Otherwise, try to add it if all previous groups were able
    2640             :          * to go on.
    2641             :          */
    2642             :         return can_add_hw;
    2643             : }
    2644             : 
    2645           0 : static void add_event_to_ctx(struct perf_event *event,
    2646             :                                struct perf_event_context *ctx)
    2647             : {
    2648           0 :         list_add_event(event, ctx);
    2649           0 :         perf_group_attach(event);
    2650           0 : }
    2651             : 
    2652             : static void ctx_sched_out(struct perf_event_context *ctx,
    2653             :                           struct perf_cpu_context *cpuctx,
    2654             :                           enum event_type_t event_type);
    2655             : static void
    2656             : ctx_sched_in(struct perf_event_context *ctx,
    2657             :              struct perf_cpu_context *cpuctx,
    2658             :              enum event_type_t event_type,
    2659             :              struct task_struct *task);
    2660             : 
    2661           0 : static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
    2662             :                                struct perf_event_context *ctx,
    2663             :                                enum event_type_t event_type)
    2664             : {
    2665           0 :         if (!cpuctx->task_ctx)
    2666             :                 return;
    2667             : 
    2668           0 :         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
    2669             :                 return;
    2670             : 
    2671           0 :         ctx_sched_out(ctx, cpuctx, event_type);
    2672             : }
    2673             : 
    2674           0 : static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
    2675             :                                 struct perf_event_context *ctx,
    2676             :                                 struct task_struct *task)
    2677             : {
    2678           0 :         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
    2679           0 :         if (ctx)
    2680           0 :                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
    2681           0 :         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
    2682           0 :         if (ctx)
    2683           0 :                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
    2684           0 : }
    2685             : 
    2686             : /*
    2687             :  * We want to maintain the following priority of scheduling:
    2688             :  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
    2689             :  *  - task pinned (EVENT_PINNED)
    2690             :  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
    2691             :  *  - task flexible (EVENT_FLEXIBLE).
    2692             :  *
    2693             :  * In order to avoid unscheduling and scheduling back in everything every
    2694             :  * time an event is added, only do it for the groups of equal priority and
    2695             :  * below.
    2696             :  *
    2697             :  * This can be called after a batch operation on task events, in which case
    2698             :  * event_type is a bit mask of the types of events involved. For CPU events,
    2699             :  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
    2700             :  */
    2701           0 : static void ctx_resched(struct perf_cpu_context *cpuctx,
    2702             :                         struct perf_event_context *task_ctx,
    2703             :                         enum event_type_t event_type)
    2704             : {
    2705           0 :         enum event_type_t ctx_event_type;
    2706           0 :         bool cpu_event = !!(event_type & EVENT_CPU);
    2707             : 
    2708             :         /*
    2709             :          * If pinned groups are involved, flexible groups also need to be
    2710             :          * scheduled out.
    2711             :          */
    2712           0 :         if (event_type & EVENT_PINNED)
    2713           0 :                 event_type |= EVENT_FLEXIBLE;
    2714             : 
    2715           0 :         ctx_event_type = event_type & EVENT_ALL;
    2716             : 
    2717           0 :         perf_pmu_disable(cpuctx->ctx.pmu);
    2718           0 :         if (task_ctx)
    2719           0 :                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
    2720             : 
    2721             :         /*
    2722             :          * Decide which cpu ctx groups to schedule out based on the types
    2723             :          * of events that caused rescheduling:
    2724             :          *  - EVENT_CPU: schedule out corresponding groups;
    2725             :          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
    2726             :          *  - otherwise, do nothing more.
    2727             :          */
    2728           0 :         if (cpu_event)
    2729           0 :                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
    2730           0 :         else if (ctx_event_type & EVENT_PINNED)
    2731           0 :                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
    2732             : 
    2733           0 :         perf_event_sched_in(cpuctx, task_ctx, current);
    2734           0 :         perf_pmu_enable(cpuctx->ctx.pmu);
    2735           0 : }
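                     :
                     : /*
                     :  * Worked example of the priority rules above, for a newly added task
                     :  * pinned event (names as in this file):
                     :  *
                     :  *      ctx_resched(cpuctx, task_ctx, EVENT_PINNED);
                     :  *        -> event_type widened to EVENT_PINNED | EVENT_FLEXIBLE
                     :  *        -> task_ctx_sched_out(cpuctx, task_ctx, EVENT_PINNED | EVENT_FLEXIBLE)
                     :  *        -> cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE)
                     :  *        -> perf_event_sched_in(cpuctx, task_ctx, current)
                     :  *
                     :  * CPU pinned groups, the highest priority, are left alone; only groups of
                     :  * equal or lower priority are rescheduled. A CPU event would instead pass
                     :  * EVENT_CPU and reschedule the corresponding CPU groups as well.
                     :  */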
    2736             : 
    2737           0 : void perf_pmu_resched(struct pmu *pmu)
    2738             : {
    2739           0 :         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
    2740           0 :         struct perf_event_context *task_ctx = cpuctx->task_ctx;
    2741             : 
    2742           0 :         perf_ctx_lock(cpuctx, task_ctx);
    2743           0 :         ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
    2744           0 :         perf_ctx_unlock(cpuctx, task_ctx);
    2745           0 : }
    2746             : 
    2747             : /*
    2748             :  * Cross CPU call to install and enable a performance event
    2749             :  *
    2750             :  * Very similar to remote_function() + event_function() but cannot assume that
    2751             :  * things like ctx->is_active and cpuctx->task_ctx are set.
    2752             :  */
    2753           0 : static int  __perf_install_in_context(void *info)
    2754             : {
    2755           0 :         struct perf_event *event = info;
    2756           0 :         struct perf_event_context *ctx = event->ctx;
    2757           0 :         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
    2758           0 :         struct perf_event_context *task_ctx = cpuctx->task_ctx;
    2759           0 :         bool reprogram = true;
    2760           0 :         int ret = 0;
    2761             : 
    2762           0 :         raw_spin_lock(&cpuctx->ctx.lock);
    2763           0 :         if (ctx->task) {
    2764           0 :                 raw_spin_lock(&ctx->lock);
    2765           0 :                 task_ctx = ctx;
    2766             : 
    2767           0 :                 reprogram = (ctx->task == current);
    2768             : 
    2769             :                 /*
    2770             :                  * If the task is running, it must be running on this CPU,
    2771             :                  * otherwise we cannot reprogram things.
    2772             :                  *
    2773             :                  * If it's not running, we don't care; ctx->lock will
    2774             :                  * serialize against it becoming runnable.
    2775             :                  */
    2776           0 :                 if (task_curr(ctx->task) && !reprogram) {
    2777           0 :                         ret = -ESRCH;
    2778           0 :                         goto unlock;
    2779             :                 }
    2780             : 
    2781           0 :                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
    2782           0 :         } else if (task_ctx) {
    2783           0 :                 raw_spin_lock(&task_ctx->lock);
    2784             :         }
    2785             : 
    2786             : #ifdef CONFIG_CGROUP_PERF
    2787             :         if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
    2788             :                 /*
    2789             :                  * If the current cgroup doesn't match the event's
    2790             :                  * cgroup, we should not try to schedule it.
    2791             :                  */
    2792             :                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
    2793             :                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
    2794             :                                         event->cgrp->css.cgroup);
    2795             :         }
    2796             : #endif
    2797             : 
    2798           0 :         if (reprogram) {
    2799           0 :                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
    2800           0 :                 add_event_to_ctx(event, ctx);
    2801           0 :                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
    2802             :         } else {
    2803           0 :                 add_event_to_ctx(event, ctx);
    2804             :         }
    2805             : 
    2806           0 : unlock:
    2807           0 :         perf_ctx_unlock(cpuctx, task_ctx);
    2808             : 
    2809           0 :         return ret;
    2810             : }
    2811             : 
    2812             : static bool exclusive_event_installable(struct perf_event *event,
    2813             :                                         struct perf_event_context *ctx);
    2814             : 
    2815             : /*
    2816             :  * Attach a performance event to a context.
    2817             :  *
    2818             :  * Very similar to event_function_call, see comment there.
    2819             :  */
    2820             : static void
    2821           0 : perf_install_in_context(struct perf_event_context *ctx,
    2822             :                         struct perf_event *event,
    2823             :                         int cpu)
    2824             : {
    2825           0 :         struct task_struct *task = READ_ONCE(ctx->task);
    2826             : 
    2827           0 :         lockdep_assert_held(&ctx->mutex);
    2828             : 
    2829           0 :         WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
    2830             : 
    2831           0 :         if (event->cpu != -1)
    2832           0 :                 event->cpu = cpu;
    2833             : 
    2834             :         /*
    2835             :          * Ensures that if we can observe event->ctx, both the event and ctx
    2836             :          * will be 'complete'. See perf_iterate_sb_cpu().
    2837             :          */
    2838           0 :         smp_store_release(&event->ctx, ctx);
    2839             : 
    2840             :         /*
    2841             :          * perf_event_attr::disabled events will not run and can be initialized
    2842             :          * without an IPI. Except when this is the first event for the context, in
    2843             :          * which case we need the magic of the IPI to set ctx->is_active.
    2844             :          *
    2845             :          * The IOC_ENABLE that is sure to follow the creation of a disabled
    2846             :          * event will issue the IPI and reprogram the hardware.
    2847             :          */
    2848           0 :         if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF && ctx->nr_events) {
    2849           0 :                 raw_spin_lock_irq(&ctx->lock);
    2850           0 :                 if (ctx->task == TASK_TOMBSTONE) {
    2851           0 :                         raw_spin_unlock_irq(&ctx->lock);
    2852           0 :                         return;
    2853             :                 }
    2854           0 :                 add_event_to_ctx(event, ctx);
    2855           0 :                 raw_spin_unlock_irq(&ctx->lock);
    2856           0 :                 return;
    2857             :         }
    2858             : 
    2859           0 :         if (!task) {
    2860           0 :                 cpu_function_call(cpu, __perf_install_in_context, event);
    2861           0 :                 return;
    2862             :         }
    2863             : 
    2864             :         /*
    2865             :          * Should not happen; we validate the ctx is still alive before calling.
    2866             :          */
    2867           0 :         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
    2868             :                 return;
    2869             : 
    2870             :         /*
    2871             :          * Installing events is tricky because we cannot rely on ctx->is_active
    2872             :          * to be set in case this is the nr_events 0 -> 1 transition.
    2873             :          *
    2874             :          * Instead we use task_curr(), which tells us if the task is running.
    2875             :          * However, since we use task_curr() outside of rq::lock, we can race
    2876             :          * against the actual state. This means the result can be wrong.
    2877             :          *
    2878             :          * If we get a false positive, we retry, this is harmless.
    2879             :          *
    2880             :          * If we get a false negative, things are complicated. If we are after
    2881             :          * perf_event_context_sched_in(), ctx::lock will serialize us, and the
    2882             :          * value must be correct. If we're before, it doesn't matter since
    2883             :          * perf_event_context_sched_in() will program the counter.
    2884             :          *
    2885             :          * However, this hinges on the remote context switch having observed
    2886             :          * our task->perf_event_ctxp[] store, such that it will in fact take
    2887             :          * ctx::lock in perf_event_context_sched_in().
    2888             :          *
    2889             :          * We do this by task_function_call(): if the IPI fails to hit the task,
    2890             :          * we know any future context switch of the task must see the
    2891             :          * perf_event_ctxp[] store.
    2892             :          */
    2893             : 
    2894             :         /*
    2895             :          * This smp_mb() orders the task->perf_event_ctxp[] store with the
    2896             :          * task_cpu() load, such that if the IPI then does not find the task
    2897             :          * running, a future context switch of that task must observe the
    2898             :          * store.
    2899             :          */
    2900           0 :         smp_mb();
    2901           0 : again:
    2902           0 :         if (!task_function_call(task, __perf_install_in_context, event))
    2903             :                 return;
    2904             : 
    2905           0 :         raw_spin_lock_irq(&ctx->lock);
    2906           0 :         task = ctx->task;
    2907           0 :         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
    2908             :                 /*
    2909             :                  * Cannot happen because we already checked above (which also
    2910             :                  * cannot happen), and we hold ctx->mutex, which serializes us
    2911             :                  * against perf_event_exit_task_context().
    2912             :                  */
    2913           0 :                 raw_spin_unlock_irq(&ctx->lock);
    2914           0 :                 return;
    2915             :         }
    2916             :         /*
    2917             :          * If the task is not running, ctx->lock will prevent it from becoming so,
    2918             :          * thus we can safely install the event.
    2919             :          */
    2920           0 :         if (task_curr(task)) {
    2921           0 :                 raw_spin_unlock_irq(&ctx->lock);
    2922           0 :                 goto again;
    2923             :         }
    2924           0 :         add_event_to_ctx(event, ctx);
    2925           0 :         raw_spin_unlock_irq(&ctx->lock);
    2926             : }
    2927             : 
    2928             : /*
    2929             :  * Cross CPU call to enable a performance event
    2930             :  */
    2931           0 : static void __perf_event_enable(struct perf_event *event,
    2932             :                                 struct perf_cpu_context *cpuctx,
    2933             :                                 struct perf_event_context *ctx,
    2934             :                                 void *info)
    2935             : {
    2936           0 :         struct perf_event *leader = event->group_leader;
    2937           0 :         struct perf_event_context *task_ctx;
    2938             : 
    2939           0 :         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
    2940             :             event->state <= PERF_EVENT_STATE_ERROR)
    2941             :                 return;
    2942             : 
    2943           0 :         if (ctx->is_active)
    2944           0 :                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
    2945             : 
    2946           0 :         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
    2947           0 :         perf_cgroup_event_enable(event, ctx);
    2948             : 
    2949           0 :         if (!ctx->is_active)
    2950             :                 return;
    2951             : 
    2952           0 :         if (!event_filter_match(event)) {
    2953           0 :                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
    2954           0 :                 return;
    2955             :         }
    2956             : 
    2957             :         /*
    2958             :          * If the event is in a group and isn't the group leader,
    2959             :          * then don't put it on unless the group is on.
    2960             :          */
    2961           0 :         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
    2962           0 :                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
    2963           0 :                 return;
    2964             :         }
    2965             : 
    2966           0 :         task_ctx = cpuctx->task_ctx;
    2967           0 :         if (ctx->task)
    2968           0 :                 WARN_ON_ONCE(task_ctx != ctx);
    2969             : 
    2970           0 :         ctx_resched(cpuctx, task_ctx, get_event_type(event));
    2971             : }
    2972             : 
    2973             : /*
    2974             :  * Enable an event.
    2975             :  *
    2976             :  * If event->ctx is a cloned context, callers must make sure that
    2977             :  * every task struct that event->ctx->task could possibly point to
    2978             :  * remains valid.  This condition is satisfied when called through
    2979             :  * perf_event_for_each_child or perf_event_for_each as described
    2980             :  * for perf_event_disable.
    2981             :  */
    2982           0 : static void _perf_event_enable(struct perf_event *event)
    2983             : {
    2984           0 :         struct perf_event_context *ctx = event->ctx;
    2985             : 
    2986           0 :         raw_spin_lock_irq(&ctx->lock);
    2987           0 :         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
    2988             :             event->state <  PERF_EVENT_STATE_ERROR) {
    2989           0 : out:
    2990           0 :                 raw_spin_unlock_irq(&ctx->lock);
    2991           0 :                 return;
    2992             :         }
    2993             : 
    2994             :         /*
    2995             :          * If the event is in error state, clear that first.
    2996             :          *
    2997             :          * That way, if we see the event in error state below, we know that it
    2998             :          * has gone back into error state, as distinct from the task having
    2999             :          * been scheduled away before the cross-call arrived.
    3000             :          */
    3001           0 :         if (event->state == PERF_EVENT_STATE_ERROR) {
    3002             :                 /*
    3003             :                  * Detached SIBLING events cannot leave ERROR state.
    3004             :                  */
    3005           0 :                 if (event->event_caps & PERF_EV_CAP_SIBLING &&
    3006           0 :                     event->group_leader == event)
    3007           0 :                         goto out;
    3008             : 
    3009           0 :                 event->state = PERF_EVENT_STATE_OFF;
    3010             :         }
    3011           0 :         raw_spin_unlock_irq(&ctx->lock);
    3012             : 
    3013           0 :         event_function_call(event, __perf_event_enable, NULL);
    3014             : }
    3015             : 
    3016             : /*
    3017             :  * See perf_event_disable();
    3018             :  */
    3019           0 : void perf_event_enable(struct perf_event *event)
    3020             : {
    3021           0 :         struct perf_event_context *ctx;
    3022             : 
    3023           0 :         ctx = perf_event_ctx_lock(event);
    3024           0 :         _perf_event_enable(event);
    3025           0 :         perf_event_ctx_unlock(event, ctx);
    3026           0 : }
    3027             : EXPORT_SYMBOL_GPL(perf_event_enable);
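                     :
                     : /*
                     :  * Illustrative sketch (not part of this file): an in-kernel user that
                     :  * created a counter with perf_event_create_kernel_counter() might bracket
                     :  * a quiet window with the exported pair above (event name hypothetical):
                     :  *
                     :  *      perf_event_disable(my_event);
                     :  *      ... reconfigure or read out state while the counter is stopped ...
                     :  *      perf_event_enable(my_event);
                     :  *
                     :  * Both helpers take the context lock via perf_event_ctx_lock() and may
                     :  * sleep; atomic contexts use perf_event_disable_local() or
                     :  * perf_event_disable_inatomic() instead, as defined earlier in this file.
                     :  */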
    3028             : 
    3029             : struct stop_event_data {
    3030             :         struct perf_event       *event;
    3031             :         unsigned int            restart;
    3032             : };
    3033             : 
    3034           0 : static int __perf_event_stop(void *info)
    3035             : {
    3036           0 :         struct stop_event_data *sd = info;
    3037           0 :         struct perf_event *event = sd->event;
    3038             : 
    3039             :         /* if it's already INACTIVE, do nothing */
    3040           0 :         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
    3041             :                 return 0;
    3042             : 
    3043             :         /* matches smp_wmb() in event_sched_in() */
    3044           0 :         smp_rmb();
    3045             : 
    3046             :         /*
    3047             :          * There is a window with interrupts enabled before we get here,
    3048             :          * so we need to check again lest we try to stop another CPU's event.
    3049             :          */
    3050           0 :         if (READ_ONCE(event->oncpu) != smp_processor_id())
    3051             :                 return -EAGAIN;
    3052             : 
    3053           0 :         event->pmu->stop(event, PERF_EF_UPDATE);
    3054             : 
    3055             :         /*
    3056             :          * May race with the actual stop (through perf_pmu_output_stop()),
    3057             :          * but it is only used for events with an AUX ring buffer, and such
    3058             :          * events will refuse to restart because of rb::aux_mmap_count==0,
    3059             :          * see comments in perf_aux_output_begin().
    3060             :          *
    3061             :          * Since this is happening on an event-local CPU, no trace is lost
    3062             :          * while restarting.
    3063             :          */
    3064           0 :         if (sd->restart)
    3065           0 :                 event->pmu->start(event, 0);
    3066             : 
    3067             :         return 0;
    3068             : }
    3069             : 
    3070           0 : static int perf_event_stop(struct perf_event *event, int restart)
    3071             : {
    3072           0 :         struct stop_event_data sd = {
    3073             :                 .event          = event,
    3074             :                 .restart        = restart,
    3075             :         };
    3076           0 :         int ret = 0;
    3077             : 
    3078           0 :         do {
    3079           0 :                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
    3080             :                         return 0;
    3081             : 
    3082             :                 /* matches smp_wmb() in event_sched_in() */
    3083           0 :                 smp_rmb();
    3084             : 
    3085             :                 /*
    3086             :                  * We only want to restart ACTIVE events, so if the event goes
    3087             :                  * inactive here (event->oncpu==-1), there's nothing more to do;
    3088             :                  * fall through with ret==-ENXIO.
    3089             :                  */
    3090           0 :                 ret = cpu_function_call(READ_ONCE(event->oncpu),
    3091             :                                         __perf_event_stop, &sd);
    3092           0 :         } while (ret == -EAGAIN);
    3093             : 
    3094             :         return ret;
    3095             : }
    3096             : 
    3097             : /*
    3098             :  * In order to contain the amount of raciness and trickiness in the address
    3099             :  * filter configuration management, it is a two-part process:
    3100             :  *
    3101             :  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
    3102             :  *      we update the addresses of corresponding vmas in
    3103             :  *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
    3104             :  * (p2) when an event is scheduled in (pmu::add), it calls
    3105             :  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
    3106             :  *      if the generation has changed since the previous call.
    3107             :  *
    3108             :  * If (p1) happens while the event is active, we restart it to force (p2).
    3109             :  *
    3110             :  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
    3111             :  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
    3112             :  *     ioctl;
    3113             :  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
    3114             :  *     registered mapping, called for every new mmap(), with mm::mmap_lock down
    3115             :  *     for reading;
    3116             :  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
    3117             :  *     of exec.
    3118             :  */
    3119           0 : void perf_event_addr_filters_sync(struct perf_event *event)
    3120             : {
    3121           0 :         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
    3122             : 
    3123           0 :         if (!has_addr_filter(event))
    3124             :                 return;
    3125             : 
    3126           0 :         raw_spin_lock(&ifh->lock);
    3127           0 :         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
    3128           0 :                 event->pmu->addr_filters_sync(event);
    3129           0 :                 event->hw.addr_filters_gen = event->addr_filters_gen;
    3130             :         }
    3131           0 :         raw_spin_unlock(&ifh->lock);
    3132             : }
    3133             : EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
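                     :
                     : /*
                     :  * Illustrative sketch (not part of this file): userspace typically installs
                     :  * the filters referred to in (p1)/(p2) above with the SET_FILTER ioctl,
                     :  * using the address filter syntax (path and range hypothetical):
                     :  *
                     :  *      ioctl(fd, PERF_EVENT_IOC_SET_FILTER,
                     :  *            "filter 0x1000/0x2000@/usr/lib/libfoo.so");
                     :  *
                     :  * After that, (p1) tracks libfoo.so's mappings and bumps addr_filters_gen;
                     :  * (p2), via perf_event_addr_filters_sync() above, pushes the resolved
                     :  * ranges to the PMU the next time the event is scheduled in.
                     :  */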
    3134             : 
    3135           0 : static int _perf_event_refresh(struct perf_event *event, int refresh)
    3136             : {
    3137             :         /*
    3138             :          * not supported on inherited events
    3139             :          */
    3140           0 :         if (event->attr.inherit || !is_sampling_event(event))
    3141             :                 return -EINVAL;
    3142             : 
    3143           0 :         atomic_add(refresh, &event->event_limit);
    3144           0 :         _perf_event_enable(event);
    3145             : 
    3146           0 :         return 0;
    3147             : }
    3148             : 
    3149             : /*
    3150             :  * See perf_event_disable()
    3151             :  */
    3152           0 : int perf_event_refresh(struct perf_event *event, int refresh)
    3153             : {
    3154           0 :         struct perf_event_context *ctx;
    3155           0 :         int ret;
    3156             : 
    3157           0 :         ctx = perf_event_ctx_lock(event);
    3158           0 :         ret = _perf_event_refresh(event, refresh);
    3159           0 :         perf_event_ctx_unlock(event, ctx);
    3160             : 
    3161           0 :         return ret;
    3162             : }
    3163             : EXPORT_SYMBOL_GPL(perf_event_refresh);
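
/*
 * Editor's sketch (not part of core.c): perf_event_refresh() is the
 * operation behind the PERF_EVENT_IOC_REFRESH ioctl.  It adds @refresh to
 * event_limit -- roughly, the number of overflows the event may take before
 * the kernel disables it again -- and then enables the event.  The counter
 * choice and sample period below are illustrative assumptions.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        attr.sample_period = 100000;    /* must be a sampling event, see -EINVAL above */
        attr.disabled = 1;              /* start disabled; REFRESH enables it */

        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        /* Allow four overflows, after which the event is disabled again. */
        if (ioctl(fd, PERF_EVENT_IOC_REFRESH, 4) < 0)
                perror("PERF_EVENT_IOC_REFRESH");

        close(fd);
        return 0;
}
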
    3164             : 
    3165           0 : static int perf_event_modify_breakpoint(struct perf_event *bp,
    3166             :                                          struct perf_event_attr *attr)
    3167             : {
    3168           0 :         int err;
    3169             : 
    3170           0 :         _perf_event_disable(bp);
    3171             : 
    3172           0 :         err = modify_user_hw_breakpoint_check(bp, attr, true);
    3173             : 
    3174           0 :         if (!bp->attr.disabled)
    3175           0 :                 _perf_event_enable(bp);
    3176             : 
    3177           0 :         return err;
    3178             : }
    3179             : 
    3180           0 : static int perf_event_modify_attr(struct perf_event *event,
    3181             :                                   struct perf_event_attr *attr)
    3182             : {
    3183           0 :         if (event->attr.type != attr->type)
    3184             :                 return -EINVAL;
    3185             : 
    3186           0 :         switch (event->attr.type) {
    3187           0 :         case PERF_TYPE_BREAKPOINT:
    3188           0 :                 return perf_event_modify_breakpoint(event, attr);
    3189             :         default:
    3190             :                 /* Place holder for future additions. */
    3191             :                 return -EOPNOTSUPP;
    3192             :         }
    3193             : }
    3194             : 
    3195           0 : static void ctx_sched_out(struct perf_event_context *ctx,
    3196             :                           struct perf_cpu_context *cpuctx,
    3197             :                           enum event_type_t event_type)
    3198             : {
    3199           0 :         struct perf_event *event, *tmp;
    3200           0 :         int is_active = ctx->is_active;
    3201             : 
    3202           0 :         lockdep_assert_held(&ctx->lock);
    3203             : 
    3204           0 :         if (likely(!ctx->nr_events)) {
    3205             :                 /*
    3206             :                  * See __perf_remove_from_context().
    3207             :                  */
    3208           0 :                 WARN_ON_ONCE(ctx->is_active);
    3209           0 :                 if (ctx->task)
    3210           0 :                         WARN_ON_ONCE(cpuctx->task_ctx);
    3211             :                 return;
    3212             :         }
    3213             : 
    3214           0 :         ctx->is_active &= ~event_type;
    3215           0 :         if (!(ctx->is_active & EVENT_ALL))
    3216           0 :                 ctx->is_active = 0;
    3217             : 
    3218           0 :         if (ctx->task) {
    3219           0 :                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
    3220           0 :                 if (!ctx->is_active)
    3221           0 :                         cpuctx->task_ctx = NULL;
    3222             :         }
    3223             : 
    3224             :         /*
     3225             :          * Always update time if it was set, not only when it changes.
    3226             :          * Otherwise we can 'forget' to update time for any but the last
    3227             :          * context we sched out. For example:
    3228             :          *
    3229             :          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
    3230             :          *   ctx_sched_out(.event_type = EVENT_PINNED)
    3231             :          *
    3232             :          * would only update time for the pinned events.
    3233             :          */
    3234           0 :         if (is_active & EVENT_TIME) {
    3235             :                 /* update (and stop) ctx time */
    3236           0 :                 update_context_time(ctx);
    3237           0 :                 update_cgrp_time_from_cpuctx(cpuctx);
    3238             :         }
    3239             : 
    3240           0 :         is_active ^= ctx->is_active; /* changed bits */
    3241             : 
    3242           0 :         if (!ctx->nr_active || !(is_active & EVENT_ALL))
    3243             :                 return;
    3244             : 
    3245           0 :         perf_pmu_disable(ctx->pmu);
    3246           0 :         if (is_active & EVENT_PINNED) {
    3247           0 :                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
    3248           0 :                         group_sched_out(event, cpuctx, ctx);
    3249             :         }
    3250             : 
    3251           0 :         if (is_active & EVENT_FLEXIBLE) {
    3252           0 :                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
    3253           0 :                         group_sched_out(event, cpuctx, ctx);
    3254             : 
    3255             :                 /*
    3256             :                  * Since we cleared EVENT_FLEXIBLE, also clear
     3257             :                  * rotate_necessary; it will be reset by
    3258             :                  * ctx_flexible_sched_in() when needed.
    3259             :                  */
    3260           0 :                 ctx->rotate_necessary = 0;
    3261             :         }
    3262           0 :         perf_pmu_enable(ctx->pmu);
    3263             : }
    3264             : 
    3265             : /*
    3266             :  * Test whether two contexts are equivalent, i.e. whether they have both been
    3267             :  * cloned from the same version of the same context.
    3268             :  *
    3269             :  * Equivalence is measured using a generation number in the context that is
    3270             :  * incremented on each modification to it; see unclone_ctx(), list_add_event()
    3271             :  * and list_del_event().
    3272             :  */
    3273           0 : static int context_equiv(struct perf_event_context *ctx1,
    3274             :                          struct perf_event_context *ctx2)
    3275             : {
    3276           0 :         lockdep_assert_held(&ctx1->lock);
    3277           0 :         lockdep_assert_held(&ctx2->lock);
    3278             : 
    3279             :         /* Pinning disables the swap optimization */
    3280           0 :         if (ctx1->pin_count || ctx2->pin_count)
    3281             :                 return 0;
    3282             : 
    3283             :         /* If ctx1 is the parent of ctx2 */
    3284           0 :         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
    3285             :                 return 1;
    3286             : 
    3287             :         /* If ctx2 is the parent of ctx1 */
    3288           0 :         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
    3289             :                 return 1;
    3290             : 
    3291             :         /*
    3292             :          * If ctx1 and ctx2 have the same parent; we flatten the parent
    3293             :          * hierarchy, see perf_event_init_context().
    3294             :          */
    3295           0 :         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
    3296           0 :                         ctx1->parent_gen == ctx2->parent_gen)
    3297           0 :                 return 1;
    3298             : 
    3299             :         /* Unmatched */
    3300             :         return 0;
    3301             : }
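
/*
 * Editor's sketch (not part of core.c): the check above is a generic
 * generation-number pattern -- a clone remembers its parent and the parent's
 * generation at clone time, and any later modification of the parent bumps
 * the generation and silently invalidates the clone.  Standalone
 * illustration; the toy_* names are made up:
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_ctx {
        struct toy_ctx *parent;         /* NULL if not a clone */
        unsigned long long generation;  /* bumped on every modification */
        unsigned long long parent_gen;  /* parent->generation at clone time */
};

static void toy_modify(struct toy_ctx *ctx)
{
        ctx->generation++;      /* what list_add_event()/list_del_event() do */
}

static bool toy_equiv(struct toy_ctx *parent, struct toy_ctx *clone)
{
        return parent == clone->parent && parent->generation == clone->parent_gen;
}

int main(void)
{
        struct toy_ctx parent = { .generation = 7 };
        struct toy_ctx clone = { .parent = &parent, .parent_gen = 7 };

        printf("equiv before modification: %d\n", toy_equiv(&parent, &clone)); /* 1 */
        toy_modify(&parent);
        printf("equiv after modification:  %d\n", toy_equiv(&parent, &clone)); /* 0 */
        return 0;
}
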
    3302             : 
    3303           0 : static void __perf_event_sync_stat(struct perf_event *event,
    3304             :                                      struct perf_event *next_event)
    3305             : {
    3306           0 :         u64 value;
    3307             : 
    3308           0 :         if (!event->attr.inherit_stat)
    3309             :                 return;
    3310             : 
    3311             :         /*
     3312             :          * Update the event value; we cannot use perf_event_read()
    3313             :          * because we're in the middle of a context switch and have IRQs
    3314             :          * disabled, which upsets smp_call_function_single(), however
    3315             :          * we know the event must be on the current CPU, therefore we
    3316             :          * don't need to use it.
    3317             :          */
    3318           0 :         if (event->state == PERF_EVENT_STATE_ACTIVE)
    3319           0 :                 event->pmu->read(event);
    3320             : 
    3321           0 :         perf_event_update_time(event);
    3322             : 
    3323             :         /*
    3324             :          * In order to keep per-task stats reliable we need to flip the event
    3325             :          * values when we flip the contexts.
    3326             :          */
    3327           0 :         value = local64_read(&next_event->count);
    3328           0 :         value = local64_xchg(&event->count, value);
    3329           0 :         local64_set(&next_event->count, value);
    3330             : 
    3331           0 :         swap(event->total_time_enabled, next_event->total_time_enabled);
    3332           0 :         swap(event->total_time_running, next_event->total_time_running);
    3333             : 
    3334             :         /*
    3335             :          * Since we swizzled the values, update the user visible data too.
    3336             :          */
    3337           0 :         perf_event_update_userpage(event);
    3338           0 :         perf_event_update_userpage(next_event);
    3339             : }
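
/*
 * Editor's sketch (not part of core.c): the count flip above keeps per-task
 * stats attached to the task even though the (cloned) context structures are
 * about to be swapped between the two tasks.  The same three-step exchange
 * written with C11 atomics, as a standalone illustration only:
 */
#include <stdatomic.h>
#include <stdio.h>

int main(void)
{
        /* counters owned by the outgoing and incoming clone, respectively */
        atomic_llong prev_count = 1000, next_count = 2500;
        long long value;

        value = atomic_load(&next_count);               /* local64_read() */
        value = atomic_exchange(&prev_count, value);    /* local64_xchg() */
        atomic_store(&next_count, value);               /* local64_set()  */

        printf("prev=%lld next=%lld\n",
               (long long)atomic_load(&prev_count),
               (long long)atomic_load(&next_count));    /* prev=2500 next=1000 */
        return 0;
}
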
    3340             : 
    3341           0 : static void perf_event_sync_stat(struct perf_event_context *ctx,
    3342             :                                    struct perf_event_context *next_ctx)
    3343             : {
    3344           0 :         struct perf_event *event, *next_event;
    3345             : 
    3346           0 :         if (!ctx->nr_stat)
    3347             :                 return;
    3348             : 
    3349           0 :         update_context_time(ctx);
    3350             : 
    3351           0 :         event = list_first_entry(&ctx->event_list,
    3352             :                                    struct perf_event, event_entry);
    3353             : 
    3354           0 :         next_event = list_first_entry(&next_ctx->event_list,
    3355             :                                         struct perf_event, event_entry);
    3356             : 
    3357           0 :         while (&event->event_entry != &ctx->event_list &&
    3358           0 :                &next_event->event_entry != &next_ctx->event_list) {
    3359             : 
    3360           0 :                 __perf_event_sync_stat(event, next_event);
    3361             : 
    3362           0 :                 event = list_next_entry(event, event_entry);
    3363           0 :                 next_event = list_next_entry(next_event, event_entry);
    3364             :         }
    3365             : }
    3366             : 
    3367           0 : static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
    3368             :                                          struct task_struct *next)
    3369             : {
    3370           0 :         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
    3371           0 :         struct perf_event_context *next_ctx;
    3372           0 :         struct perf_event_context *parent, *next_parent;
    3373           0 :         struct perf_cpu_context *cpuctx;
    3374           0 :         int do_switch = 1;
    3375           0 :         struct pmu *pmu;
    3376             : 
    3377           0 :         if (likely(!ctx))
    3378             :                 return;
    3379             : 
    3380           0 :         pmu = ctx->pmu;
    3381           0 :         cpuctx = __get_cpu_context(ctx);
    3382           0 :         if (!cpuctx->task_ctx)
    3383             :                 return;
    3384             : 
    3385           0 :         rcu_read_lock();
    3386           0 :         next_ctx = next->perf_event_ctxp[ctxn];
    3387           0 :         if (!next_ctx)
    3388           0 :                 goto unlock;
    3389             : 
    3390           0 :         parent = rcu_dereference(ctx->parent_ctx);
    3391           0 :         next_parent = rcu_dereference(next_ctx->parent_ctx);
    3392             : 
     3393             :         /* If neither context has a parent context, they cannot be clones. */
    3394           0 :         if (!parent && !next_parent)
    3395           0 :                 goto unlock;
    3396             : 
    3397           0 :         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
    3398             :                 /*
    3399             :                  * Looks like the two contexts are clones, so we might be
    3400             :                  * able to optimize the context switch.  We lock both
    3401             :                  * contexts and check that they are clones under the
    3402             :                  * lock (including re-checking that neither has been
    3403             :                  * uncloned in the meantime).  It doesn't matter which
    3404             :                  * order we take the locks because no other cpu could
    3405             :                  * be trying to lock both of these tasks.
    3406             :                  */
    3407           0 :                 raw_spin_lock(&ctx->lock);
    3408           0 :                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
    3409           0 :                 if (context_equiv(ctx, next_ctx)) {
    3410             : 
    3411           0 :                         WRITE_ONCE(ctx->task, next);
    3412           0 :                         WRITE_ONCE(next_ctx->task, task);
    3413             : 
    3414           0 :                         perf_pmu_disable(pmu);
    3415             : 
    3416           0 :                         if (cpuctx->sched_cb_usage && pmu->sched_task)
    3417           0 :                                 pmu->sched_task(ctx, false);
    3418             : 
    3419             :                         /*
     3420             :                          * PMU-specific parts of the task perf context can require
     3421             :                          * additional synchronization. As an example of such
     3422             :                          * synchronization, see the implementation details of Intel
     3423             :                          * LBR call stack data profiling.
    3424             :                          */
    3425           0 :                         if (pmu->swap_task_ctx)
    3426           0 :                                 pmu->swap_task_ctx(ctx, next_ctx);
    3427             :                         else
    3428           0 :                                 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
    3429             : 
    3430           0 :                         perf_pmu_enable(pmu);
    3431             : 
    3432             :                         /*
    3433             :                          * RCU_INIT_POINTER here is safe because we've not
    3434             :                          * modified the ctx and the above modification of
    3435             :                          * ctx->task and ctx->task_ctx_data are immaterial
    3436             :                          * since those values are always verified under
    3437             :                          * ctx->lock which we're now holding.
    3438             :                          */
    3439           0 :                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
    3440           0 :                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
    3441             : 
    3442           0 :                         do_switch = 0;
    3443             : 
    3444           0 :                         perf_event_sync_stat(ctx, next_ctx);
    3445             :                 }
    3446           0 :                 raw_spin_unlock(&next_ctx->lock);
    3447           0 :                 raw_spin_unlock(&ctx->lock);
    3448             :         }
    3449           0 : unlock:
    3450           0 :         rcu_read_unlock();
    3451             : 
    3452           0 :         if (do_switch) {
    3453           0 :                 raw_spin_lock(&ctx->lock);
    3454           0 :                 perf_pmu_disable(pmu);
    3455             : 
    3456           0 :                 if (cpuctx->sched_cb_usage && pmu->sched_task)
    3457           0 :                         pmu->sched_task(ctx, false);
    3458           0 :                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
    3459             : 
    3460           0 :                 perf_pmu_enable(pmu);
    3461           0 :                 raw_spin_unlock(&ctx->lock);
    3462             :         }
    3463             : }
    3464             : 
    3465             : static DEFINE_PER_CPU(struct list_head, sched_cb_list);
    3466             : 
    3467           0 : void perf_sched_cb_dec(struct pmu *pmu)
    3468             : {
    3469           0 :         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
    3470             : 
    3471           0 :         this_cpu_dec(perf_sched_cb_usages);
    3472             : 
    3473           0 :         if (!--cpuctx->sched_cb_usage)
    3474           0 :                 list_del(&cpuctx->sched_cb_entry);
    3475           0 : }
    3476             : 
    3477             : 
    3478           0 : void perf_sched_cb_inc(struct pmu *pmu)
    3479             : {
    3480           0 :         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
    3481             : 
    3482           0 :         if (!cpuctx->sched_cb_usage++)
    3483           0 :                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
    3484             : 
    3485           0 :         this_cpu_inc(perf_sched_cb_usages);
    3486           0 : }
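
/*
 * Editor's sketch (not part of core.c): perf_sched_cb_inc()/_dec() above use
 * the common "first user links, last user unlinks" refcount pattern for the
 * per-CPU callback list.  Standalone illustration with made-up toy_* names:
 */
#include <stdio.h>

struct toy_node { int usage; int on_list; };

static void toy_inc(struct toy_node *n)
{
        if (!n->usage++)        /* 0 -> 1: first user registers the callback */
                n->on_list = 1;
}

static void toy_dec(struct toy_node *n)
{
        if (!--n->usage)        /* 1 -> 0: last user unregisters it */
                n->on_list = 0;
}

int main(void)
{
        struct toy_node n = { 0, 0 };

        toy_inc(&n);
        toy_inc(&n);
        toy_dec(&n);
        printf("usage=%d on_list=%d\n", n.usage, n.on_list);    /* usage=1 on_list=1 */
        toy_dec(&n);
        printf("usage=%d on_list=%d\n", n.usage, n.on_list);    /* usage=0 on_list=0 */
        return 0;
}
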
    3487             : 
    3488             : /*
    3489             :  * This function provides the context switch callback to the lower code
    3490             :  * layer. It is invoked ONLY when the context switch callback is enabled.
    3491             :  *
    3492             :  * This callback is relevant even to per-cpu events; for example multi event
    3493             :  * PEBS requires this to provide PID/TID information. This requires we flush
    3494             :  * all queued PEBS records before we context switch to a new task.
    3495             :  */
    3496           0 : static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in)
    3497             : {
    3498           0 :         struct pmu *pmu;
    3499             : 
    3500           0 :         pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
    3501             : 
    3502           0 :         if (WARN_ON_ONCE(!pmu->sched_task))
    3503             :                 return;
    3504             : 
    3505           0 :         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
    3506           0 :         perf_pmu_disable(pmu);
    3507             : 
    3508           0 :         pmu->sched_task(cpuctx->task_ctx, sched_in);
    3509             : 
    3510           0 :         perf_pmu_enable(pmu);
    3511           0 :         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
    3512             : }
    3513             : 
    3514           0 : static void perf_pmu_sched_task(struct task_struct *prev,
    3515             :                                 struct task_struct *next,
    3516             :                                 bool sched_in)
    3517             : {
    3518           0 :         struct perf_cpu_context *cpuctx;
    3519             : 
    3520           0 :         if (prev == next)
    3521             :                 return;
    3522             : 
    3523           0 :         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
    3524             :                 /* will be handled in perf_event_context_sched_in/out */
    3525           0 :                 if (cpuctx->task_ctx)
    3526           0 :                         continue;
    3527             : 
    3528           0 :                 __perf_pmu_sched_task(cpuctx, sched_in);
    3529             :         }
    3530             : }
    3531             : 
    3532             : static void perf_event_switch(struct task_struct *task,
    3533             :                               struct task_struct *next_prev, bool sched_in);
    3534             : 
    3535             : #define for_each_task_context_nr(ctxn)                                  \
    3536             :         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
    3537             : 
    3538             : /*
    3539             :  * Called from scheduler to remove the events of the current task,
    3540             :  * with interrupts disabled.
    3541             :  *
    3542             :  * We stop each event and update the event value in event->count.
    3543             :  *
    3544             :  * This does not protect us against NMI, but disable()
    3545             :  * sets the disabled bit in the control field of event _before_
     3546             :  * accessing the event control register. If an NMI hits, then it will
    3547             :  * not restart the event.
    3548             :  */
    3549           0 : void __perf_event_task_sched_out(struct task_struct *task,
    3550             :                                  struct task_struct *next)
    3551             : {
    3552           0 :         int ctxn;
    3553             : 
    3554           0 :         if (__this_cpu_read(perf_sched_cb_usages))
    3555           0 :                 perf_pmu_sched_task(task, next, false);
    3556             : 
    3557           0 :         if (atomic_read(&nr_switch_events))
    3558           0 :                 perf_event_switch(task, next, false);
    3559             : 
    3560           0 :         for_each_task_context_nr(ctxn)
    3561           0 :                 perf_event_context_sched_out(task, ctxn, next);
    3562             : 
    3563             :         /*
     3564             :          * If cgroup events exist on this CPU, then we need
     3565             :          * to check if we have to switch out PMU state;
     3566             :          * cgroup events are system-wide only.
    3567             :          */
    3568           0 :         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
    3569           0 :                 perf_cgroup_sched_out(task, next);
    3570           0 : }
    3571             : 
    3572             : /*
    3573             :  * Called with IRQs disabled
    3574             :  */
    3575           0 : static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
    3576             :                               enum event_type_t event_type)
    3577             : {
    3578           0 :         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
    3579           0 : }
    3580             : 
    3581           0 : static bool perf_less_group_idx(const void *l, const void *r)
    3582             : {
    3583           0 :         const struct perf_event *le = *(const struct perf_event **)l;
    3584           0 :         const struct perf_event *re = *(const struct perf_event **)r;
    3585             : 
    3586           0 :         return le->group_index < re->group_index;
    3587             : }
    3588             : 
    3589           0 : static void swap_ptr(void *l, void *r)
    3590             : {
    3591           0 :         void **lp = l, **rp = r;
    3592             : 
    3593           0 :         swap(*lp, *rp);
    3594           0 : }
    3595             : 
    3596             : static const struct min_heap_callbacks perf_min_heap = {
    3597             :         .elem_size = sizeof(struct perf_event *),
    3598             :         .less = perf_less_group_idx,
    3599             :         .swp = swap_ptr,
    3600             : };
    3601             : 
    3602           0 : static void __heap_add(struct min_heap *heap, struct perf_event *event)
    3603             : {
    3604           0 :         struct perf_event **itrs = heap->data;
    3605             : 
    3606           0 :         if (event) {
    3607           0 :                 itrs[heap->nr] = event;
    3608           0 :                 heap->nr++;
    3609             :         }
    3610             : }
    3611             : 
    3612           0 : static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
    3613             :                                 struct perf_event_groups *groups, int cpu,
    3614             :                                 int (*func)(struct perf_event *, void *),
    3615             :                                 void *data)
    3616             : {
    3617             : #ifdef CONFIG_CGROUP_PERF
    3618             :         struct cgroup_subsys_state *css = NULL;
    3619             : #endif
    3620             :         /* Space for per CPU and/or any CPU event iterators. */
    3621           0 :         struct perf_event *itrs[2];
    3622           0 :         struct min_heap event_heap;
    3623           0 :         struct perf_event **evt;
    3624           0 :         int ret;
    3625             : 
    3626           0 :         if (cpuctx) {
    3627           0 :                 event_heap = (struct min_heap){
    3628           0 :                         .data = cpuctx->heap,
    3629             :                         .nr = 0,
    3630           0 :                         .size = cpuctx->heap_size,
    3631             :                 };
    3632             : 
    3633           0 :                 lockdep_assert_held(&cpuctx->ctx.lock);
    3634             : 
    3635             : #ifdef CONFIG_CGROUP_PERF
    3636             :                 if (cpuctx->cgrp)
    3637             :                         css = &cpuctx->cgrp->css;
    3638             : #endif
    3639             :         } else {
    3640           0 :                 event_heap = (struct min_heap){
    3641             :                         .data = itrs,
    3642             :                         .nr = 0,
    3643             :                         .size = ARRAY_SIZE(itrs),
    3644             :                 };
    3645             :                 /* Events not within a CPU context may be on any CPU. */
    3646           0 :                 __heap_add(&event_heap, perf_event_groups_first(groups, -1, NULL));
    3647             :         }
    3648           0 :         evt = event_heap.data;
    3649             : 
    3650           0 :         __heap_add(&event_heap, perf_event_groups_first(groups, cpu, NULL));
    3651             : 
    3652             : #ifdef CONFIG_CGROUP_PERF
    3653             :         for (; css; css = css->parent)
    3654             :                 __heap_add(&event_heap, perf_event_groups_first(groups, cpu, css->cgroup));
    3655             : #endif
    3656             : 
    3657           0 :         min_heapify_all(&event_heap, &perf_min_heap);
    3658             : 
    3659           0 :         while (event_heap.nr) {
    3660           0 :                 ret = func(*evt, data);
    3661           0 :                 if (ret)
    3662           0 :                         return ret;
    3663             : 
    3664           0 :                 *evt = perf_event_groups_next(*evt);
    3665           0 :                 if (*evt)
    3666           0 :                         min_heapify(&event_heap, 0, &perf_min_heap);
    3667             :                 else
    3668           0 :                         min_heap_pop(&event_heap, &perf_min_heap);
    3669             :         }
    3670             : 
    3671             :         return 0;
    3672             : }
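
/*
 * Editor's sketch (not part of core.c): visit_groups_merge() is a k-way
 * merge -- one sorted iterator per source (any-CPU, this-CPU, and cgroup
 * ancestors), kept in a min-heap keyed on group_index so that events are
 * visited in global insertion order.  Standalone illustration using plain
 * sorted arrays instead of rbtree iterators; all names are made up:
 */
#include <stdio.h>
#include <stddef.h>

struct toy_iter { const int *pos, *end; };      /* one sorted source */

static void toy_sift_down(struct toy_iter *heap, size_t nr)
{
        size_t i = 0;

        for (;;) {
                size_t l = 2 * i + 1, r = l + 1, m = i;
                struct toy_iter tmp;

                if (l < nr && *heap[l].pos < *heap[m].pos)
                        m = l;
                if (r < nr && *heap[r].pos < *heap[m].pos)
                        m = r;
                if (m == i)
                        break;
                tmp = heap[i];
                heap[i] = heap[m];
                heap[m] = tmp;
                i = m;
        }
}

int main(void)
{
        static const int any_cpu[]  = { 1, 7, 9 };      /* group_index values */
        static const int this_cpu[] = { 2, 3, 8 };
        struct toy_iter heap[2] = {
                { any_cpu,  any_cpu  + 3 },
                { this_cpu, this_cpu + 3 },
        };
        size_t nr = 2;

        toy_sift_down(heap, nr);                /* min_heapify_all() for two entries */

        while (nr) {
                printf("%d ", *heap[0].pos);    /* func(*evt, data) */
                if (++heap[0].pos == heap[0].end)
                        heap[0] = heap[--nr];   /* iterator exhausted: pop */
                toy_sift_down(heap, nr);        /* restore heap order */
        }
        printf("\n");                           /* prints: 1 2 3 7 8 9 */
        return 0;
}
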
    3673             : 
    3674           0 : static int merge_sched_in(struct perf_event *event, void *data)
    3675             : {
    3676           0 :         struct perf_event_context *ctx = event->ctx;
    3677           0 :         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
    3678           0 :         int *can_add_hw = data;
    3679             : 
    3680           0 :         if (event->state <= PERF_EVENT_STATE_OFF)
    3681             :                 return 0;
    3682             : 
    3683           0 :         if (!event_filter_match(event))
    3684             :                 return 0;
    3685             : 
    3686           0 :         if (group_can_go_on(event, cpuctx, *can_add_hw)) {
    3687           0 :                 if (!group_sched_in(event, cpuctx, ctx))
    3688           0 :                         list_add_tail(&event->active_list, get_event_list(event));
    3689             :         }
    3690             : 
    3691           0 :         if (event->state == PERF_EVENT_STATE_INACTIVE) {
    3692           0 :                 if (event->attr.pinned) {
    3693           0 :                         perf_cgroup_event_disable(event, ctx);
    3694           0 :                         perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
    3695             :                 }
    3696             : 
    3697           0 :                 *can_add_hw = 0;
    3698           0 :                 ctx->rotate_necessary = 1;
    3699           0 :                 perf_mux_hrtimer_restart(cpuctx);
    3700             :         }
    3701             : 
    3702             :         return 0;
    3703             : }
    3704             : 
    3705             : static void
    3706           0 : ctx_pinned_sched_in(struct perf_event_context *ctx,
    3707             :                     struct perf_cpu_context *cpuctx)
    3708             : {
    3709           0 :         int can_add_hw = 1;
    3710             : 
    3711           0 :         if (ctx != &cpuctx->ctx)
    3712           0 :                 cpuctx = NULL;
    3713             : 
    3714           0 :         visit_groups_merge(cpuctx, &ctx->pinned_groups,
    3715           0 :                            smp_processor_id(),
    3716             :                            merge_sched_in, &can_add_hw);
    3717           0 : }
    3718             : 
    3719             : static void
    3720           0 : ctx_flexible_sched_in(struct perf_event_context *ctx,
    3721             :                       struct perf_cpu_context *cpuctx)
    3722             : {
    3723           0 :         int can_add_hw = 1;
    3724             : 
    3725           0 :         if (ctx != &cpuctx->ctx)
    3726           0 :                 cpuctx = NULL;
    3727             : 
    3728           0 :         visit_groups_merge(cpuctx, &ctx->flexible_groups,
    3729           0 :                            smp_processor_id(),
    3730             :                            merge_sched_in, &can_add_hw);
    3731           0 : }
    3732             : 
    3733             : static void
    3734           0 : ctx_sched_in(struct perf_event_context *ctx,
    3735             :              struct perf_cpu_context *cpuctx,
    3736             :              enum event_type_t event_type,
    3737             :              struct task_struct *task)
    3738             : {
    3739           0 :         int is_active = ctx->is_active;
    3740           0 :         u64 now;
    3741             : 
    3742           0 :         lockdep_assert_held(&ctx->lock);
    3743             : 
    3744           0 :         if (likely(!ctx->nr_events))
    3745             :                 return;
    3746             : 
    3747           0 :         ctx->is_active |= (event_type | EVENT_TIME);
    3748           0 :         if (ctx->task) {
    3749           0 :                 if (!is_active)
    3750           0 :                         cpuctx->task_ctx = ctx;
    3751             :                 else
    3752           0 :                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
    3753             :         }
    3754             : 
    3755           0 :         is_active ^= ctx->is_active; /* changed bits */
    3756             : 
    3757           0 :         if (is_active & EVENT_TIME) {
    3758             :                 /* start ctx time */
    3759           0 :                 now = perf_clock();
    3760           0 :                 ctx->timestamp = now;
    3761           0 :                 perf_cgroup_set_timestamp(task, ctx);
    3762             :         }
    3763             : 
    3764             :         /*
    3765             :          * First go through the list and put on any pinned groups
    3766             :          * in order to give them the best chance of going on.
    3767             :          */
    3768           0 :         if (is_active & EVENT_PINNED)
    3769           0 :                 ctx_pinned_sched_in(ctx, cpuctx);
    3770             : 
    3771             :         /* Then walk through the lower prio flexible groups */
    3772           0 :         if (is_active & EVENT_FLEXIBLE)
    3773           0 :                 ctx_flexible_sched_in(ctx, cpuctx);
    3774             : }
    3775             : 
    3776           0 : static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
    3777             :                              enum event_type_t event_type,
    3778             :                              struct task_struct *task)
    3779             : {
    3780           0 :         struct perf_event_context *ctx = &cpuctx->ctx;
    3781             : 
    3782           0 :         ctx_sched_in(ctx, cpuctx, event_type, task);
    3783             : }
    3784             : 
    3785           0 : static void perf_event_context_sched_in(struct perf_event_context *ctx,
    3786             :                                         struct task_struct *task)
    3787             : {
    3788           0 :         struct perf_cpu_context *cpuctx;
    3789           0 :         struct pmu *pmu = ctx->pmu;
    3790             : 
    3791           0 :         cpuctx = __get_cpu_context(ctx);
    3792           0 :         if (cpuctx->task_ctx == ctx) {
    3793           0 :                 if (cpuctx->sched_cb_usage)
    3794           0 :                         __perf_pmu_sched_task(cpuctx, true);
    3795           0 :                 return;
    3796             :         }
    3797             : 
    3798           0 :         perf_ctx_lock(cpuctx, ctx);
    3799             :         /*
    3800             :          * We must check ctx->nr_events while holding ctx->lock, such
    3801             :          * that we serialize against perf_install_in_context().
    3802             :          */
    3803           0 :         if (!ctx->nr_events)
    3804           0 :                 goto unlock;
    3805             : 
    3806           0 :         perf_pmu_disable(pmu);
    3807             :         /*
    3808             :          * We want to keep the following priority order:
    3809             :          * cpu pinned (that don't need to move), task pinned,
    3810             :          * cpu flexible, task flexible.
    3811             :          *
    3812             :          * However, if task's ctx is not carrying any pinned
    3813             :          * events, no need to flip the cpuctx's events around.
    3814             :          */
    3815           0 :         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
    3816           0 :                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
    3817           0 :         perf_event_sched_in(cpuctx, ctx, task);
    3818             : 
    3819           0 :         if (cpuctx->sched_cb_usage && pmu->sched_task)
    3820           0 :                 pmu->sched_task(cpuctx->task_ctx, true);
    3821             : 
    3822           0 :         perf_pmu_enable(pmu);
    3823             : 
    3824           0 : unlock:
    3825           0 :         perf_ctx_unlock(cpuctx, ctx);
    3826             : }
    3827             : 
    3828             : /*
    3829             :  * Called from scheduler to add the events of the current task
    3830             :  * with interrupts disabled.
    3831             :  *
    3832             :  * We restore the event value and then enable it.
    3833             :  *
    3834             :  * This does not protect us against NMI, but enable()
    3835             :  * sets the enabled bit in the control field of event _before_
     3836             :  * accessing the event control register. If an NMI hits, then it will
    3837             :  * keep the event running.
    3838             :  */
    3839           0 : void __perf_event_task_sched_in(struct task_struct *prev,
    3840             :                                 struct task_struct *task)
    3841             : {
    3842           0 :         struct perf_event_context *ctx;
    3843           0 :         int ctxn;
    3844             : 
    3845             :         /*
    3846             :          * If cgroup events exist on this CPU, then we need to check if we have
     3847             :  * to switch in PMU state; cgroup events are system-wide only.
    3848             :          *
    3849             :          * Since cgroup events are CPU events, we must schedule these in before
    3850             :          * we schedule in the task events.
    3851             :          */
    3852           0 :         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
    3853           0 :                 perf_cgroup_sched_in(prev, task);
    3854             : 
    3855           0 :         for_each_task_context_nr(ctxn) {
    3856           0 :                 ctx = task->perf_event_ctxp[ctxn];
    3857           0 :                 if (likely(!ctx))
    3858           0 :                         continue;
    3859             : 
    3860           0 :                 perf_event_context_sched_in(ctx, task);
    3861             :         }
    3862             : 
    3863           0 :         if (atomic_read(&nr_switch_events))
    3864           0 :                 perf_event_switch(task, prev, true);
    3865             : 
    3866           0 :         if (__this_cpu_read(perf_sched_cb_usages))
    3867           0 :                 perf_pmu_sched_task(prev, task, true);
    3868           0 : }
    3869             : 
    3870           0 : static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
    3871             : {
    3872           0 :         u64 frequency = event->attr.sample_freq;
    3873           0 :         u64 sec = NSEC_PER_SEC;
    3874           0 :         u64 divisor, dividend;
    3875             : 
    3876           0 :         int count_fls, nsec_fls, frequency_fls, sec_fls;
    3877             : 
    3878           0 :         count_fls = fls64(count);
    3879           0 :         nsec_fls = fls64(nsec);
    3880           0 :         frequency_fls = fls64(frequency);
    3881           0 :         sec_fls = 30;
    3882             : 
    3883             :         /*
    3884             :          * We got @count in @nsec, with a target of sample_freq HZ
    3885             :          * the target period becomes:
    3886             :          *
    3887             :          *             @count * 10^9
    3888             :          * period = -------------------
    3889             :          *          @nsec * sample_freq
    3890             :          *
    3891             :          */
    3892             : 
    3893             :         /*
    3894             :          * Reduce accuracy by one bit such that @a and @b converge
    3895             :          * to a similar magnitude.
    3896             :          */
    3897             : #define REDUCE_FLS(a, b)                \
    3898             : do {                                    \
    3899             :         if (a##_fls > b##_fls) {     \
    3900             :                 a >>= 1;          \
    3901             :                 a##_fls--;              \
    3902             :         } else {                        \
    3903             :                 b >>= 1;          \
    3904             :                 b##_fls--;              \
    3905             :         }                               \
    3906             : } while (0)
    3907             : 
    3908             :         /*
    3909             :          * Reduce accuracy until either term fits in a u64, then proceed with
    3910             :          * the other, so that finally we can do a u64/u64 division.
    3911             :          */
    3912           0 :         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
    3913           0 :                 REDUCE_FLS(nsec, frequency);
    3914           0 :                 REDUCE_FLS(sec, count);
    3915             :         }
    3916             : 
    3917           0 :         if (count_fls + sec_fls > 64) {
    3918           0 :                 divisor = nsec * frequency;
    3919             : 
    3920           0 :                 while (count_fls + sec_fls > 64) {
    3921           0 :                         REDUCE_FLS(count, sec);
    3922           0 :                         divisor >>= 1;
    3923             :                 }
    3924             : 
    3925           0 :                 dividend = count * sec;
    3926             :         } else {
    3927           0 :                 dividend = count * sec;
    3928             : 
    3929           0 :                 while (nsec_fls + frequency_fls > 64) {
    3930           0 :                         REDUCE_FLS(nsec, frequency);
    3931           0 :                         dividend >>= 1;
    3932             :                 }
    3933             : 
    3934           0 :                 divisor = nsec * frequency;
    3935             :         }
    3936             : 
    3937           0 :         if (!divisor)
    3938             :                 return dividend;
    3939             : 
    3940           0 :         return div64_u64(dividend, divisor);
    3941             : }
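
/*
 * Editor's sketch (not part of core.c): the REDUCE_FLS() dance above exists
 * only to evaluate
 *
 *      period = (count * NSEC_PER_SEC) / (nsec * sample_freq)
 *
 * without overflowing 64-bit intermediates.  With 128-bit integers (a GCC/
 * Clang extension) the same computation can be written directly; the sample
 * numbers below are made up (1.2M events in 10 ms at a requested 4000 Hz):
 */
#include <stdio.h>
#include <stdint.h>

#define TOY_NSEC_PER_SEC 1000000000ULL

static uint64_t toy_calculate_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
        unsigned __int128 dividend = (unsigned __int128)count * TOY_NSEC_PER_SEC;
        unsigned __int128 divisor  = (unsigned __int128)nsec * freq;

        /* degenerate case mirrors the !divisor branch above */
        return divisor ? (uint64_t)(dividend / divisor) : (uint64_t)dividend;
}

int main(void)
{
        /* 1,200,000 events observed in 10 ms, target sample_freq = 4000 Hz */
        uint64_t period = toy_calculate_period(1200000, 10000000, 4000);

        /* 1.2e6 * 1e9 / (1e7 * 4e3) = 30000 events per sample */
        printf("new sample_period = %llu\n", (unsigned long long)period);
        return 0;
}
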
    3942             : 
    3943             : static DEFINE_PER_CPU(int, perf_throttled_count);
    3944             : static DEFINE_PER_CPU(u64, perf_throttled_seq);
    3945             : 
    3946           0 : static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
    3947             : {
    3948           0 :         struct hw_perf_event *hwc = &event->hw;
    3949           0 :         s64 period, sample_period;
    3950           0 :         s64 delta;
    3951             : 
    3952           0 :         period = perf_calculate_period(event, nsec, count);
    3953             : 
    3954           0 :         delta = (s64)(period - hwc->sample_period);
    3955           0 :         delta = (delta + 7) / 8; /* low pass filter */
    3956             : 
    3957           0 :         sample_period = hwc->sample_period + delta;
    3958             : 
    3959           0 :         if (!sample_period)
    3960           0 :                 sample_period = 1;
    3961             : 
    3962           0 :         hwc->sample_period = sample_period;
    3963             : 
    3964           0 :         if (local64_read(&hwc->period_left) > 8*sample_period) {
    3965           0 :                 if (disable)
    3966           0 :                         event->pmu->stop(event, PERF_EF_UPDATE);
    3967             : 
    3968           0 :                 local64_set(&hwc->period_left, 0);
    3969             : 
    3970           0 :                 if (disable)
    3971           0 :                         event->pmu->start(event, PERF_EF_RELOAD);
    3972             :         }
    3973           0 : }
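
/*
 * Editor's sketch (not part of core.c): the "(delta + 7) / 8" step above is
 * a simple low-pass filter -- each tick the sample period moves only about
 * one eighth of the way toward the freshly computed target, so a single
 * noisy tick cannot yank the period around.  Standalone illustration with
 * made-up numbers:
 */
#include <stdio.h>

int main(void)
{
        long long sample_period = 10000;        /* current period */
        long long target = 30000;               /* period from perf_calculate_period() */
        int tick;

        for (tick = 1; tick <= 8; tick++) {
                long long delta = (target - sample_period + 7) / 8;

                sample_period += delta;
                printf("tick %d: sample_period = %lld\n", tick, sample_period);
        }
        /* converges toward 30000: 12500, 14688, 16602, 18277, 19743, ... */
        return 0;
}
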
    3974             : 
    3975             : /*
     3976             :  * Combine freq adjustment with unthrottling to avoid two passes over the
     3977             :  * events. At the same time, make sure that having freq events does not change
     3978             :  * the rate of unthrottling, as that would introduce bias.
    3979             :  */
    3980           0 : static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
    3981             :                                            int needs_unthr)
    3982             : {
    3983           0 :         struct perf_event *event;
    3984           0 :         struct hw_perf_event *hwc;
    3985           0 :         u64 now, period = TICK_NSEC;
    3986           0 :         s64 delta;
    3987             : 
    3988             :         /*
     3989             :          * We only need to iterate over all events iff:
     3990             :          * - the context has events in frequency mode (needs freq adjust)
    3991             :          * - there are events to unthrottle on this cpu
    3992             :          */
    3993           0 :         if (!(ctx->nr_freq || needs_unthr))
    3994             :                 return;
    3995             : 
    3996           0 :         raw_spin_lock(&ctx->lock);
    3997           0 :         perf_pmu_disable(ctx->pmu);
    3998             : 
    3999           0 :         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
    4000           0 :                 if (event->state != PERF_EVENT_STATE_ACTIVE)
    4001           0 :                         continue;
    4002             : 
    4003           0 :                 if (!event_filter_match(event))
    4004           0 :                         continue;
    4005             : 
    4006           0 :                 perf_pmu_disable(event->pmu);
    4007             : 
    4008           0 :                 hwc = &event->hw;
    4009             : 
    4010           0 :                 if (hwc->interrupts == MAX_INTERRUPTS) {
    4011           0 :                         hwc->interrupts = 0;
    4012           0 :                         perf_log_throttle(event, 1);
    4013           0 :                         event->pmu->start(event, 0);
    4014             :                 }
    4015             : 
    4016           0 :                 if (!event->attr.freq || !event->attr.sample_freq)
    4017           0 :                         goto next;
    4018             : 
    4019             :                 /*
    4020             :                  * stop the event and update event->count
    4021             :                  */
    4022           0 :                 event->pmu->stop(event, PERF_EF_UPDATE);
    4023             : 
    4024           0 :                 now = local64_read(&event->count);
    4025           0 :                 delta = now - hwc->freq_count_stamp;
    4026           0 :                 hwc->freq_count_stamp = now;
    4027             : 
    4028             :                 /*
    4029             :                  * restart the event
    4030             :                  * reload only if value has changed
    4031             :                  * we have stopped the event so tell that
    4032             :                  * to perf_adjust_period() to avoid stopping it
    4033             :                  * twice.
    4034             :                  */
    4035           0 :                 if (delta > 0)
    4036           0 :                         perf_adjust_period(event, period, delta, false);
    4037             : 
    4038           0 :                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
    4039           0 :         next:
    4040           0 :                 perf_pmu_enable(event->pmu);
    4041             :         }
    4042             : 
    4043           0 :         perf_pmu_enable(ctx->pmu);
    4044           0 :         raw_spin_unlock(&ctx->lock);
    4045             : }
    4046             : 
    4047             : /*
     4048             :  * Move @event to the tail of @ctx's eligible events.
    4049             :  */
    4050           0 : static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
    4051             : {
    4052             :         /*
     4053             :          * Rotate the first entry of the non-pinned groups to the tail.
     4054             :          * Rotation might be disabled by the inheritance code.
    4055             :          */
    4056           0 :         if (ctx->rotate_disable)
    4057             :                 return;
    4058             : 
    4059           0 :         perf_event_groups_delete(&ctx->flexible_groups, event);
    4060           0 :         perf_event_groups_insert(&ctx->flexible_groups, event);
    4061             : }
    4062             : 
    4063             : /* pick an event from the flexible_groups to rotate */
    4064             : static inline struct perf_event *
    4065           0 : ctx_event_to_rotate(struct perf_event_context *ctx)
    4066             : {
    4067           0 :         struct perf_event *event;
    4068             : 
    4069             :         /* pick the first active flexible event */
    4070           0 :         event = list_first_entry_or_null(&ctx->flexible_active,
    4071             :                                          struct perf_event, active_list);
    4072             : 
    4073             :         /* if no active flexible event, pick the first event */
    4074           0 :         if (!event) {
    4075           0 :                 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
    4076             :                                       typeof(*event), group_node);
    4077             :         }
    4078             : 
    4079             :         /*
    4080             :          * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
    4081             :          * finds there are unschedulable events, it will set it again.
    4082             :          */
    4083           0 :         ctx->rotate_necessary = 0;
    4084             : 
    4085           0 :         return event;
    4086             : }
    4087             : 
    4088           0 : static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
    4089             : {
    4090           0 :         struct perf_event *cpu_event = NULL, *task_event = NULL;
    4091           0 :         struct perf_event_context *task_ctx = NULL;
    4092           0 :         int cpu_rotate, task_rotate;
    4093             : 
    4094             :         /*
    4095             :          * Since we run this from IRQ context, nobody can install new
    4096             :          * events, thus the event count values are stable.
    4097             :          */
    4098             : 
    4099           0 :         cpu_rotate = cpuctx->ctx.rotate_necessary;
    4100           0 :         task_ctx = cpuctx->task_ctx;
    4101           0 :         task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
    4102             : 
    4103           0 :         if (!(cpu_rotate || task_rotate))
    4104             :                 return false;
    4105             : 
    4106           0 :         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
    4107           0 :         perf_pmu_disable(cpuctx->ctx.pmu);
    4108             : 
    4109           0 :         if (task_rotate)
    4110           0 :                 task_event = ctx_event_to_rotate(task_ctx);
    4111           0 :         if (cpu_rotate)
    4112           0 :                 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
    4113             : 
    4114             :         /*
     4115             :          * As per the order given at ctx_resched(), first 'pop' the task flexible
     4116             :          * events and then, if needed, the CPU flexible events.
    4117             :          */
    4118           0 :         if (task_event || (task_ctx && cpu_event))
    4119           0 :                 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
    4120           0 :         if (cpu_event)
    4121           0 :                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
    4122             : 
    4123           0 :         if (task_event)
    4124           0 :                 rotate_ctx(task_ctx, task_event);
    4125           0 :         if (cpu_event)
    4126           0 :                 rotate_ctx(&cpuctx->ctx, cpu_event);
    4127             : 
    4128           0 :         perf_event_sched_in(cpuctx, task_ctx, current);
    4129             : 
    4130           0 :         perf_pmu_enable(cpuctx->ctx.pmu);
    4131           0 :         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
    4132             : 
    4133           0 :         return true;
    4134             : }
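
/*
 * Editor's sketch (not part of core.c): rotate_ctx() gets its round-robin
 * effect from the ordering key alone -- deleting the event and re-inserting
 * it makes perf_event_groups_insert() hand it a fresh, larger group_index,
 * which sorts it behind every other flexible event.  Same idea on a plain
 * array (standalone; names made up):
 */
#include <stdio.h>

int main(void)
{
        unsigned long next_index = 4;                   /* monotonically increasing key */
        unsigned long group_index[3] = { 1, 2, 3 };
        int i, min, round;

        for (round = 0; round < 3; round++) {
                /* "schedule" in key order; the smallest key goes on first */
                min = 0;
                for (i = 1; i < 3; i++)
                        if (group_index[i] < group_index[min])
                                min = i;
                printf("round %d: event with key %lu scheduled first\n",
                       round, group_index[min]);

                /* rotate_ctx(): delete + insert == take the next larger key */
                group_index[min] = next_index++;
        }
        return 0;
}
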
    4135             : 
    4136       32228 : void perf_event_task_tick(void)
    4137             : {
    4138       32228 :         struct list_head *head = this_cpu_ptr(&active_ctx_list);
    4139       32415 :         struct perf_event_context *ctx, *tmp;
    4140       32415 :         int throttled;
    4141             : 
    4142       64926 :         lockdep_assert_irqs_disabled();
    4143             : 
    4144       32459 :         __this_cpu_inc(perf_throttled_seq);
    4145       32459 :         throttled = __this_cpu_xchg(perf_throttled_count, 0);
    4146       32459 :         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
    4147             : 
    4148       32459 :         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
    4149           0 :                 perf_adjust_freq_unthr_context(ctx, throttled);
    4150       32463 : }
    4151             : 
    4152           0 : static int event_enable_on_exec(struct perf_event *event,
    4153             :                                 struct perf_event_context *ctx)
    4154             : {
    4155           0 :         if (!event->attr.enable_on_exec)
    4156             :                 return 0;
    4157             : 
    4158           0 :         event->attr.enable_on_exec = 0;
    4159           0 :         if (event->state >= PERF_EVENT_STATE_INACTIVE)
    4160             :                 return 0;
    4161             : 
    4162           0 :         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
    4163             : 
    4164           0 :         return 1;
    4165             : }
    4166             : 
    4167             : /*
    4168             :  * Enable all of a task's events that have been marked enable-on-exec.
    4169             :  * This expects task == current.
    4170             :  */
    4171           0 : static void perf_event_enable_on_exec(int ctxn)
    4172             : {
    4173           0 :         struct perf_event_context *ctx, *clone_ctx = NULL;
    4174           0 :         enum event_type_t event_type = 0;
    4175           0 :         struct perf_cpu_context *cpuctx;
    4176           0 :         struct perf_event *event;
    4177           0 :         unsigned long flags;
    4178           0 :         int enabled = 0;
    4179             : 
    4180           0 :         local_irq_save(flags);
    4181           0 :         ctx = current->perf_event_ctxp[ctxn];
    4182           0 :         if (!ctx || !ctx->nr_events)
    4183           0 :                 goto out;
    4184             : 
    4185           0 :         cpuctx = __get_cpu_context(ctx);
    4186           0 :         perf_ctx_lock(cpuctx, ctx);
    4187           0 :         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
    4188           0 :         list_for_each_entry(event, &ctx->event_list, event_entry) {
    4189           0 :                 enabled |= event_enable_on_exec(event, ctx);
    4190           0 :                 event_type |= get_event_type(event);
    4191             :         }
    4192             : 
    4193             :         /*
    4194             :          * Unclone and reschedule this context if we enabled any event.
    4195             :          */
    4196           0 :         if (enabled) {
    4197           0 :                 clone_ctx = unclone_ctx(ctx);
    4198           0 :                 ctx_resched(cpuctx, ctx, event_type);
    4199             :         } else {
    4200           0 :                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
    4201             :         }
    4202           0 :         perf_ctx_unlock(cpuctx, ctx);
    4203             : 
    4204           0 : out:
    4205           0 :         local_irq_restore(flags);
    4206             : 
    4207           0 :         if (clone_ctx)
    4208           0 :                 put_ctx(clone_ctx);
    4209           0 : }
    4210             : 
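A minimal userspace sketch (assumed, not part of core.c) of how the enable_on_exec path above is normally driven, in the style of perf stat: the counter is opened disabled on a forked child with attr.enable_on_exec set, and perf_event_enable_on_exec() turns it on when the child calls exec(). The pipe handshake, the "true" workload and the helper names are placeholders; error handling is minimal.

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Thin wrapper; glibc provides no perf_event_open() wrapper. */
    static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
            struct perf_event_attr attr;
            int go[2];
            pid_t child;
            uint64_t count;

            if (pipe(go))
                    return 1;

            child = fork();
            if (child == 0) {
                    char c;
                    close(go[1]);
                    read(go[0], &c, 1);     /* wait until the counter is attached */
                    execlp("true", "true", (char *)NULL);   /* placeholder workload */
                    _exit(127);
            }

            memset(&attr, 0, sizeof(attr));
            attr.size = sizeof(attr);
            attr.type = PERF_TYPE_HARDWARE;
            attr.config = PERF_COUNT_HW_INSTRUCTIONS;
            attr.disabled = 1;              /* stays off ...              */
            attr.enable_on_exec = 1;        /* ... until the child execs  */
            attr.exclude_kernel = 1;

            int fd = perf_event_open(&attr, child, -1, -1, 0);
            if (fd < 0) {
                    perror("perf_event_open");
                    return 1;
            }

            close(go[0]);
            write(go[1], "x", 1);           /* let the child exec */
            waitpid(child, NULL, 0);

            /* read_format is 0, so read() returns just the 64-bit count. */
            if (read(fd, &count, sizeof(count)) == sizeof(count))
                    printf("instructions (post-exec only): %llu\n",
                           (unsigned long long)count);
            close(fd);
            return 0;
    }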
    4211             : struct perf_read_data {
    4212             :         struct perf_event *event;
    4213             :         bool group;
    4214             :         int ret;
    4215             : };
    4216             : 
    4217           0 : static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
    4218             : {
    4219           0 :         u16 local_pkg, event_pkg;
    4220             : 
    4221           0 :         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
    4222           0 :                 int local_cpu = smp_processor_id();
    4223             : 
    4224           0 :                 event_pkg = topology_physical_package_id(event_cpu);
    4225           0 :                 local_pkg = topology_physical_package_id(local_cpu);
    4226             : 
    4227           0 :                 if (event_pkg == local_pkg)
    4228           0 :                         return local_cpu;
    4229             :         }
    4230             : 
    4231             :         return event_cpu;
    4232             : }
    4233             : 
    4234             : /*
    4235             :  * Cross CPU call to read the hardware event
    4236             :  */
    4237           0 : static void __perf_event_read(void *info)
    4238             : {
    4239           0 :         struct perf_read_data *data = info;
    4240           0 :         struct perf_event *sub, *event = data->event;
    4241           0 :         struct perf_event_context *ctx = event->ctx;
    4242           0 :         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
    4243           0 :         struct pmu *pmu = event->pmu;
    4244             : 
    4245             :         /*
    4246             :          * If this is a task context, we need to check whether it is
    4247             :          * the current task context of this CPU.  If not, it has been
    4248             :          * scheduled out before the smp call arrived.  In that case
    4249             :          * event->count would have been updated to a recent sample
    4250             :          * when the event was scheduled out.
    4251             :          */
    4252           0 :         if (ctx->task && cpuctx->task_ctx != ctx)
    4253             :                 return;
    4254             : 
    4255           0 :         raw_spin_lock(&ctx->lock);
    4256           0 :         if (ctx->is_active & EVENT_TIME) {
    4257           0 :                 update_context_time(ctx);
    4258           0 :                 update_cgrp_time_from_event(event);
    4259             :         }
    4260             : 
    4261           0 :         perf_event_update_time(event);
    4262           0 :         if (data->group)
    4263           0 :                 perf_event_update_sibling_time(event);
    4264             : 
    4265           0 :         if (event->state != PERF_EVENT_STATE_ACTIVE)
    4266           0 :                 goto unlock;
    4267             : 
    4268           0 :         if (!data->group) {
    4269           0 :                 pmu->read(event);
    4270           0 :                 data->ret = 0;
    4271           0 :                 goto unlock;
    4272             :         }
    4273             : 
    4274           0 :         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
    4275             : 
    4276           0 :         pmu->read(event);
    4277             : 
    4278           0 :         for_each_sibling_event(sub, event) {
    4279           0 :                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
    4280             :                         /*
    4281             :                          * Use sibling's PMU rather than @event's since
    4282             :                          * the sibling could be on a different (e.g. software) PMU.
    4283             :                          */
    4284           0 :                         sub->pmu->read(sub);
    4285             :                 }
    4286             :         }
    4287             : 
    4288           0 :         data->ret = pmu->commit_txn(pmu);
    4289             : 
    4290           0 : unlock:
    4291           0 :         raw_spin_unlock(&ctx->lock);
    4292             : }
    4293             : 
    4294           0 : static inline u64 perf_event_count(struct perf_event *event)
    4295             : {
    4296           0 :         return local64_read(&event->count) + atomic64_read(&event->child_count);
    4297             : }
    4298             : 
    4299             : /*
    4300             :  * NMI-safe method to read a local event, that is, an event that
    4301             :  * is:
    4302             :  *   - either for the current task, or for this CPU
    4303             :  *   - does not have inherit set, because inherited task events
    4304             :  *     will not be local and we cannot read them atomically
    4305             :  *   - must not have a pmu::count method
    4306             :  */
    4307           0 : int perf_event_read_local(struct perf_event *event, u64 *value,
    4308             :                           u64 *enabled, u64 *running)
    4309             : {
    4310           0 :         unsigned long flags;
    4311           0 :         int ret = 0;
    4312             : 
    4313             :         /*
    4314             :          * Disabling interrupts avoids all counter scheduling (context
    4315             :          * switches, timer based rotation and IPIs).
    4316             :          */
    4317           0 :         local_irq_save(flags);
    4318             : 
    4319             :         /*
    4320             :          * It must not be an event with inherit set; we cannot read
    4321             :          * all child counters from atomic context.
    4322             :          */
    4323           0 :         if (event->attr.inherit) {
    4324           0 :                 ret = -EOPNOTSUPP;
    4325           0 :                 goto out;
    4326             :         }
    4327             : 
    4328             :         /* If this is a per-task event, it must be for current */
    4329           0 :         if ((event->attach_state & PERF_ATTACH_TASK) &&
    4330           0 :             event->hw.target != current) {
    4331           0 :                 ret = -EINVAL;
    4332           0 :                 goto out;
    4333             :         }
    4334             : 
    4335             :         /* If this is a per-CPU event, it must be for this CPU */
    4336           0 :         if (!(event->attach_state & PERF_ATTACH_TASK) &&
    4337           0 :             event->cpu != smp_processor_id()) {
    4338           0 :                 ret = -EINVAL;
    4339           0 :                 goto out;
    4340             :         }
    4341             : 
    4342             :         /* If this is a pinned event, it must be running on this CPU */
    4343           0 :         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
    4344           0 :                 ret = -EBUSY;
    4345           0 :                 goto out;
    4346             :         }
    4347             : 
    4348             :         /*
    4349             :          * If the event is currently on this CPU, it's either a per-task event,
    4350             :          * or local to this CPU. Furthermore, it means it's ACTIVE (otherwise
    4351             :          * oncpu == -1).
    4352             :          */
    4353           0 :         if (event->oncpu == smp_processor_id())
    4354           0 :                 event->pmu->read(event);
    4355             : 
    4356           0 :         *value = local64_read(&event->count);
    4357           0 :         if (enabled || running) {
    4358           0 :                 u64 now = event->shadow_ctx_time + perf_clock();
    4359           0 :                 u64 __enabled, __running;
    4360             : 
    4361           0 :                 __perf_update_times(event, now, &__enabled, &__running);
    4362           0 :                 if (enabled)
    4363           0 :                         *enabled = __enabled;
    4364           0 :                 if (running)
    4365           0 :                         *running = __running;
    4366             :         }
    4367           0 : out:
    4368           0 :         local_irq_restore(flags);
    4369             : 
    4370           0 :         return ret;
    4371             : }
    4372             : 
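A minimal in-kernel sketch (assumed, not from core.c) of the constraints the comment above spells out: the counter is created for the current task with perf_event_create_kernel_counter(), inherit is left clear, and both reads happen on the same task. measure_current_cycles() is an illustrative name, and whether these helpers are reachable from a loadable module depends on their export status, so treat this as built-in code.

    #include <linux/perf_event.h>
    #include <linux/sched.h>
    #include <linux/err.h>

    /* Illustrative helper, not a kernel function. */
    static int measure_current_cycles(u64 *delta)
    {
            struct perf_event_attr attr = {
                    .type   = PERF_TYPE_HARDWARE,
                    .config = PERF_COUNT_HW_CPU_CYCLES,
                    .size   = sizeof(attr),
                    /* .inherit stays 0: perf_event_read_local() rejects inherited events */
            };
            struct perf_event *ev;
            u64 before = 0, after = 0;
            int err;

            /* Per-task counter bound to current; no overflow handler. */
            ev = perf_event_create_kernel_counter(&attr, -1, current, NULL, NULL);
            if (IS_ERR(ev))
                    return PTR_ERR(ev);

            err = perf_event_read_local(ev, &before, NULL, NULL);
            if (!err) {
                    /* ... the code to be measured runs here ... */
                    err = perf_event_read_local(ev, &after, NULL, NULL);
            }
            if (!err)
                    *delta = after - before;

            perf_event_release_kernel(ev);
            return err;
    }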
    4373           0 : static int perf_event_read(struct perf_event *event, bool group)
    4374             : {
    4375           0 :         enum perf_event_state state = READ_ONCE(event->state);
    4376           0 :         int event_cpu, ret = 0;
    4377             : 
    4378             :         /*
    4379             :          * If event is enabled and currently active on a CPU, update the
    4380             :          * value in the event structure:
    4381             :          */
    4382           0 : again:
    4383           0 :         if (state == PERF_EVENT_STATE_ACTIVE) {
    4384           0 :                 struct perf_read_data data;
    4385             : 
    4386             :                 /*
    4387             :                  * Orders the ->state and ->oncpu loads such that if we see
    4388             :                  * ACTIVE we must also see the right ->oncpu.
    4389             :                  *
    4390             :                  * Matches the smp_wmb() from event_sched_in().
    4391             :                  */
    4392           0 :                 smp_rmb();
    4393             : 
    4394           0 :                 event_cpu = READ_ONCE(event->oncpu);
    4395           0 :                 if ((unsigned)event_cpu >= nr_cpu_ids)
    4396           0 :                         return 0;
    4397             : 
    4398           0 :                 data = (struct perf_read_data){
    4399             :                         .event = event,
    4400             :                         .group = group,
    4401             :                         .ret = 0,
    4402             :                 };
    4403             : 
    4404           0 :                 preempt_disable();
    4405           0 :                 event_cpu = __perf_event_read_cpu(event, event_cpu);
    4406             : 
    4407             :                 /*
    4408             :                  * Purposely ignore the smp_call_function_single() return
    4409             :                  * value.
    4410             :                  *
    4411             :                  * If event_cpu isn't a valid CPU, it means the event got
    4412             :                  * scheduled out and that will have updated the event count.
    4413             :                  *
    4414             :                  * Therefore, either way, we'll have an up-to-date event count
    4415             :                  * after this.
    4416             :                  */
    4417           0 :                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
    4418           0 :                 preempt_enable();
    4419           0 :                 ret = data.ret;
    4420             : 
    4421           0 :         } else if (state == PERF_EVENT_STATE_INACTIVE) {
    4422           0 :                 struct perf_event_context *ctx = event->ctx;
    4423           0 :                 unsigned long flags;
    4424             : 
    4425           0 :                 raw_spin_lock_irqsave(&ctx->lock, flags);
    4426           0 :                 state = event->state;
    4427           0 :                 if (state != PERF_EVENT_STATE_INACTIVE) {
    4428           0 :                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
    4429           0 :                         goto again;
    4430             :                 }
    4431             : 
    4432             :                 /*
    4433             :                  * May read while context is not active (e.g., thread is
    4434             :                  * blocked); in that case we cannot update context time
    4435             :                  */
    4436           0 :                 if (ctx->is_active & EVENT_TIME) {
    4437           0 :                         update_context_time(ctx);
    4438           0 :                         update_cgrp_time_from_event(event);
    4439             :                 }
    4440             : 
    4441           0 :                 perf_event_update_time(event);
    4442           0 :                 if (group)
    4443           0 :                         perf_event_update_sibling_time(event);
    4444           0 :                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
    4445             :         }
    4446             : 
    4447             :         return ret;
    4448             : }
    4449             : 
    4450             : /*
    4451             :  * Initialize the perf_event context in a task_struct:
    4452             :  */
    4453           8 : static void __perf_event_init_context(struct perf_event_context *ctx)
    4454             : {
    4455           8 :         raw_spin_lock_init(&ctx->lock);
    4456           8 :         mutex_init(&ctx->mutex);
    4457           8 :         INIT_LIST_HEAD(&ctx->active_ctx_list);
    4458           8 :         perf_event_groups_init(&ctx->pinned_groups);
    4459           8 :         perf_event_groups_init(&ctx->flexible_groups);
    4460           8 :         INIT_LIST_HEAD(&ctx->event_list);
    4461           8 :         INIT_LIST_HEAD(&ctx->pinned_active);
    4462           8 :         INIT_LIST_HEAD(&ctx->flexible_active);
    4463           8 :         refcount_set(&ctx->refcount, 1);
    4464           8 : }
    4465             : 
    4466             : static struct perf_event_context *
    4467           0 : alloc_perf_context(struct pmu *pmu, struct task_struct *task)
    4468             : {
    4469           0 :         struct perf_event_context *ctx;
    4470             : 
    4471           0 :         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
    4472           0 :         if (!ctx)
    4473             :                 return NULL;
    4474             : 
    4475           0 :         __perf_event_init_context(ctx);
    4476           0 :         if (task)
    4477           0 :                 ctx->task = get_task_struct(task);
    4478           0 :         ctx->pmu = pmu;
    4479             : 
    4480           0 :         return ctx;
    4481             : }
    4482             : 
    4483             : static struct task_struct *
    4484           0 : find_lively_task_by_vpid(pid_t vpid)
    4485             : {
    4486           0 :         struct task_struct *task;
    4487             : 
    4488           0 :         rcu_read_lock();
    4489           0 :         if (!vpid)
    4490           0 :                 task = current;
    4491             :         else
    4492           0 :                 task = find_task_by_vpid(vpid);
    4493           0 :         if (task)
    4494           0 :                 get_task_struct(task);
    4495           0 :         rcu_read_unlock();
    4496             : 
    4497           0 :         if (!task)
    4498           0 :                 return ERR_PTR(-ESRCH);
    4499             : 
    4500             :         return task;
    4501             : }
    4502             : 
    4503             : /*
    4504             :  * Returns a matching context with refcount and pincount.
    4505             :  */
    4506             : static struct perf_event_context *
    4507           0 : find_get_context(struct pmu *pmu, struct task_struct *task,
    4508             :                 struct perf_event *event)
    4509             : {
    4510           0 :         struct perf_event_context *ctx, *clone_ctx = NULL;
    4511           0 :         struct perf_cpu_context *cpuctx;
    4512           0 :         void *task_ctx_data = NULL;
    4513           0 :         unsigned long flags;
    4514           0 :         int ctxn, err;
    4515           0 :         int cpu = event->cpu;
    4516             : 
    4517           0 :         if (!task) {
    4518             :                 /* Must be root to operate on a CPU event: */
    4519           0 :                 err = perf_allow_cpu(&event->attr);
    4520           0 :                 if (err)
    4521           0 :                         return ERR_PTR(err);
    4522             : 
    4523           0 :                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
    4524           0 :                 ctx = &cpuctx->ctx;
    4525           0 :                 get_ctx(ctx);
    4526           0 :                 ++ctx->pin_count;
    4527             : 
    4528           0 :                 return ctx;
    4529             :         }
    4530             : 
    4531           0 :         err = -EINVAL;
    4532           0 :         ctxn = pmu->task_ctx_nr;
    4533           0 :         if (ctxn < 0)
    4534           0 :                 goto errout;
    4535             : 
    4536           0 :         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
    4537           0 :                 task_ctx_data = alloc_task_ctx_data(pmu);
    4538           0 :                 if (!task_ctx_data) {
    4539           0 :                         err = -ENOMEM;
    4540           0 :                         goto errout;
    4541             :                 }
    4542             :         }
    4543             : 
    4544           0 : retry:
    4545           0 :         ctx = perf_lock_task_context(task, ctxn, &flags);
    4546           0 :         if (ctx) {
    4547           0 :                 clone_ctx = unclone_ctx(ctx);
    4548           0 :                 ++ctx->pin_count;
    4549             : 
    4550           0 :                 if (task_ctx_data && !ctx->task_ctx_data) {
    4551           0 :                         ctx->task_ctx_data = task_ctx_data;
    4552           0 :                         task_ctx_data = NULL;
    4553             :                 }
    4554           0 :                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
    4555             : 
    4556           0 :                 if (clone_ctx)
    4557           0 :                         put_ctx(clone_ctx);
    4558             :         } else {
    4559           0 :                 ctx = alloc_perf_context(pmu, task);
    4560           0 :                 err = -ENOMEM;
    4561           0 :                 if (!ctx)
    4562           0 :                         goto errout;
    4563             : 
    4564           0 :                 if (task_ctx_data) {
    4565           0 :                         ctx->task_ctx_data = task_ctx_data;
    4566           0 :                         task_ctx_data = NULL;
    4567             :                 }
    4568             : 
    4569           0 :                 err = 0;
    4570           0 :                 mutex_lock(&task->perf_event_mutex);
    4571             :                 /*
    4572             :                  * If it has already passed perf_event_exit_task(),
    4573             :                  * we must see PF_EXITING; it takes this mutex too.
    4574             :                  */
    4575           0 :                 if (task->flags & PF_EXITING)
    4576             :                         err = -ESRCH;
    4577           0 :                 else if (task->perf_event_ctxp[ctxn])
    4578             :                         err = -EAGAIN;
    4579             :                 else {
    4580           0 :                         get_ctx(ctx);
    4581           0 :                         ++ctx->pin_count;
    4582           0 :                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
    4583             :                 }
    4584           0 :                 mutex_unlock(&task->perf_event_mutex);
    4585             : 
    4586           0 :                 if (unlikely(err)) {
    4587           0 :                         put_ctx(ctx);
    4588             : 
    4589           0 :                         if (err == -EAGAIN)
    4590           0 :                                 goto retry;
    4591           0 :                         goto errout;
    4592             :                 }
    4593             :         }
    4594             : 
    4595           0 :         free_task_ctx_data(pmu, task_ctx_data);
    4596           0 :         return ctx;
    4597             : 
    4598           0 : errout:
    4599           0 :         free_task_ctx_data(pmu, task_ctx_data);
    4600           0 :         return ERR_PTR(err);
    4601             : }
    4602             : 
    4603             : static void perf_event_free_filter(struct perf_event *event);
    4604             : static void perf_event_free_bpf_prog(struct perf_event *event);
    4605             : 
    4606           0 : static void free_event_rcu(struct rcu_head *head)
    4607             : {
    4608           0 :         struct perf_event *event;
    4609             : 
    4610           0 :         event = container_of(head, struct perf_event, rcu_head);
    4611           0 :         if (event->ns)
    4612           0 :                 put_pid_ns(event->ns);
    4613           0 :         perf_event_free_filter(event);
    4614           0 :         kfree(event);
    4615           0 : }
    4616             : 
    4617             : static void ring_buffer_attach(struct perf_event *event,
    4618             :                                struct perf_buffer *rb);
    4619             : 
    4620           0 : static void detach_sb_event(struct perf_event *event)
    4621             : {
    4622           0 :         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
    4623             : 
    4624           0 :         raw_spin_lock(&pel->lock);
    4625           0 :         list_del_rcu(&event->sb_list);
    4626           0 :         raw_spin_unlock(&pel->lock);
    4627           0 : }
    4628             : 
    4629           0 : static bool is_sb_event(struct perf_event *event)
    4630             : {
    4631           0 :         struct perf_event_attr *attr = &event->attr;
    4632             : 
    4633           0 :         if (event->parent)
    4634             :                 return false;
    4635             : 
    4636           0 :         if (event->attach_state & PERF_ATTACH_TASK)
    4637             :                 return false;
    4638             : 
    4639           0 :         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
    4640             :             attr->comm || attr->comm_exec ||
    4641             :             attr->task || attr->ksymbol ||
    4642           0 :             attr->context_switch || attr->text_poke ||
    4643             :             attr->bpf_event)
    4644           0 :                 return true;
    4645             :         return false;
    4646             : }
    4647             : 
    4648           0 : static void unaccount_pmu_sb_event(struct perf_event *event)
    4649             : {
    4650           0 :         if (is_sb_event(event))
    4651           0 :                 detach_sb_event(event);
    4652           0 : }
    4653             : 
    4654           0 : static void unaccount_event_cpu(struct perf_event *event, int cpu)
    4655             : {
    4656           0 :         if (event->parent)
    4657             :                 return;
    4658             : 
    4659           0 :         if (is_cgroup_event(event))
    4660           0 :                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
    4661             : }
    4662             : 
    4663             : #ifdef CONFIG_NO_HZ_FULL
    4664             : static DEFINE_SPINLOCK(nr_freq_lock);
    4665             : #endif
    4666             : 
    4667             : static void unaccount_freq_event_nohz(void)
    4668             : {
    4669             : #ifdef CONFIG_NO_HZ_FULL
    4670             :         spin_lock(&nr_freq_lock);
    4671             :         if (atomic_dec_and_test(&nr_freq_events))
    4672             :                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
    4673             :         spin_unlock(&nr_freq_lock);
    4674             : #endif
    4675             : }
    4676             : 
    4677           0 : static void unaccount_freq_event(void)
    4678             : {
    4679           0 :         if (tick_nohz_full_enabled())
    4680             :                 unaccount_freq_event_nohz();
    4681             :         else
    4682           0 :                 atomic_dec(&nr_freq_events);
    4683           0 : }
    4684             : 
    4685           0 : static void unaccount_event(struct perf_event *event)
    4686             : {
    4687           0 :         bool dec = false;
    4688             : 
    4689           0 :         if (event->parent)
    4690             :                 return;
    4691             : 
    4692           0 :         if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
    4693           0 :                 dec = true;
    4694           0 :         if (event->attr.mmap || event->attr.mmap_data)
    4695           0 :                 atomic_dec(&nr_mmap_events);
    4696           0 :         if (event->attr.build_id)
    4697           0 :                 atomic_dec(&nr_build_id_events);
    4698           0 :         if (event->attr.comm)
    4699           0 :                 atomic_dec(&nr_comm_events);
    4700           0 :         if (event->attr.namespaces)
    4701           0 :                 atomic_dec(&nr_namespaces_events);
    4702           0 :         if (event->attr.cgroup)
    4703           0 :                 atomic_dec(&nr_cgroup_events);
    4704           0 :         if (event->attr.task)
    4705           0 :                 atomic_dec(&nr_task_events);
    4706           0 :         if (event->attr.freq)
    4707           0 :                 unaccount_freq_event();
    4708           0 :         if (event->attr.context_switch) {
    4709           0 :                 dec = true;
    4710           0 :                 atomic_dec(&nr_switch_events);
    4711             :         }
    4712           0 :         if (is_cgroup_event(event))
    4713             :                 dec = true;
    4714           0 :         if (has_branch_stack(event))
    4715           0 :                 dec = true;
    4716           0 :         if (event->attr.ksymbol)
    4717           0 :                 atomic_dec(&nr_ksymbol_events);
    4718           0 :         if (event->attr.bpf_event)
    4719           0 :                 atomic_dec(&nr_bpf_events);
    4720           0 :         if (event->attr.text_poke)
    4721           0 :                 atomic_dec(&nr_text_poke_events);
    4722             : 
    4723           0 :         if (dec) {
    4724           0 :                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
    4725           0 :                         schedule_delayed_work(&perf_sched_work, HZ);
    4726             :         }
    4727             : 
    4728           0 :         unaccount_event_cpu(event, event->cpu);
    4729             : 
    4730           0 :         unaccount_pmu_sb_event(event);
    4731             : }
    4732             : 
    4733           0 : static void perf_sched_delayed(struct work_struct *work)
    4734             : {
    4735           0 :         mutex_lock(&perf_sched_mutex);
    4736           0 :         if (atomic_dec_and_test(&perf_sched_count))
    4737           0 :                 static_branch_disable(&perf_sched_events);
    4738           0 :         mutex_unlock(&perf_sched_mutex);
    4739           0 : }
    4740             : 
    4741             : /*
    4742             :  * The following implement mutual exclusion of events on "exclusive" pmus
    4743             :  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
    4744             :  * at a time, so we disallow creating events that might conflict, namely:
    4745             :  *
    4746             :  *  1) cpu-wide events in the presence of per-task events,
    4747             :  *  2) per-task events in the presence of cpu-wide events,
    4748             :  *  3) two matching events on the same context.
    4749             :  *
    4750             :  * The former two cases are handled in the allocation path (perf_event_alloc(),
    4751             :  * _free_event()), the latter -- before the first perf_install_in_context().
    4752             :  */
    4753           0 : static int exclusive_event_init(struct perf_event *event)
    4754             : {
    4755           0 :         struct pmu *pmu = event->pmu;
    4756             : 
    4757           0 :         if (!is_exclusive_pmu(pmu))
    4758             :                 return 0;
    4759             : 
    4760             :         /*
    4761             :          * Prevent co-existence of per-task and cpu-wide events on the
    4762             :          * same exclusive pmu.
    4763             :          *
    4764             :          * Negative pmu::exclusive_cnt means there are cpu-wide
    4765             :          * events on this "exclusive" pmu; positive means there are
    4766             :          * per-task events.
    4767             :          *
    4768             :          * Since this is called in the perf_event_alloc() path, event::ctx
    4769             :          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
    4770             :          * to mean "per-task event", because unlike other attach states it
    4771             :          * never gets cleared.
    4772             :          */
    4773           0 :         if (event->attach_state & PERF_ATTACH_TASK) {
    4774           0 :                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
    4775           0 :                         return -EBUSY;
    4776             :         } else {
    4777           0 :                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
    4778           0 :                         return -EBUSY;
    4779             :         }
    4780             : 
    4781             :         return 0;
    4782             : }
    4783             : 
    4784           0 : static void exclusive_event_destroy(struct perf_event *event)
    4785             : {
    4786           0 :         struct pmu *pmu = event->pmu;
    4787             : 
    4788           0 :         if (!is_exclusive_pmu(pmu))
    4789             :                 return;
    4790             : 
    4791             :         /* see comment in exclusive_event_init() */
    4792           0 :         if (event->attach_state & PERF_ATTACH_TASK)
    4793           0 :                 atomic_dec(&pmu->exclusive_cnt);
    4794             :         else
    4795           0 :                 atomic_inc(&pmu->exclusive_cnt);
    4796             : }
    4797             : 
    4798           0 : static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
    4799             : {
    4800           0 :         if ((e1->pmu == e2->pmu) &&
    4801           0 :             (e1->cpu == e2->cpu ||
    4802           0 :              e1->cpu == -1 ||
    4803             :              e2->cpu == -1))
    4804             :                 return true;
    4805             :         return false;
    4806             : }
    4807             : 
    4808           0 : static bool exclusive_event_installable(struct perf_event *event,
    4809             :                                         struct perf_event_context *ctx)
    4810             : {
    4811           0 :         struct perf_event *iter_event;
    4812           0 :         struct pmu *pmu = event->pmu;
    4813             : 
    4814           0 :         lockdep_assert_held(&ctx->mutex);
    4815             : 
    4816           0 :         if (!is_exclusive_pmu(pmu))
    4817             :                 return true;
    4818             : 
    4819           0 :         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
    4820           0 :                 if (exclusive_event_match(iter_event, event))
    4821             :                         return false;
    4822             :         }
    4823             : 
    4824             :         return true;
    4825             : }
    4826             : 
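For context, a skeletal driver fragment (assumed, not from core.c) showing how a PMU opts into the mutual-exclusion rules enforced above: it sets PERF_PMU_CAP_EXCLUSIVE before registration, after which conflicting events get -EBUSY from exclusive_event_init() or fail exclusive_event_installable(). my_exclusive_pmu and the my_* callbacks are made-up names; a real driver also fills in .module, sysfs attributes and real programming logic.

    #include <linux/perf_event.h>
    #include <linux/init.h>

    static int my_event_init(struct perf_event *event)
    {
            if (event->attr.type != event->pmu->type)
                    return -ENOENT;         /* not our PMU */
            return 0;
    }

    static int  my_add(struct perf_event *event, int flags)   { return 0; }
    static void my_del(struct perf_event *event, int flags)   { }
    static void my_start(struct perf_event *event, int flags) { }
    static void my_stop(struct perf_event *event, int flags)  { }
    static void my_read(struct perf_event *event)             { }

    static struct pmu my_exclusive_pmu = {
            .capabilities   = PERF_PMU_CAP_EXCLUSIVE,  /* one event scheduled at a time */
            .task_ctx_nr    = perf_sw_context,
            .event_init     = my_event_init,
            .add            = my_add,
            .del            = my_del,
            .start          = my_start,
            .stop           = my_stop,
            .read           = my_read,
    };

    static int __init my_pmu_init(void)
    {
            return perf_pmu_register(&my_exclusive_pmu, "my_exclusive_pmu", -1);
    }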
    4827             : static void perf_addr_filters_splice(struct perf_event *event,
    4828             :                                        struct list_head *head);
    4829             : 
    4830           0 : static void _free_event(struct perf_event *event)
    4831             : {
    4832           0 :         irq_work_sync(&event->pending);
    4833             : 
    4834           0 :         unaccount_event(event);
    4835             : 
    4836           0 :         security_perf_event_free(event);
    4837             : 
    4838           0 :         if (event->rb) {
    4839             :                 /*
    4840             :                  * Can happen when we close an event with re-directed output.
    4841             :                  *
    4842             :                  * Since we have a 0 refcount, perf_mmap_close() will skip
    4843             :                  * over us, possibly making our ring_buffer_put() the last.
    4844             :                  */
    4845           0 :                 mutex_lock(&event->mmap_mutex);
    4846           0 :                 ring_buffer_attach(event, NULL);
    4847           0 :                 mutex_unlock(&event->mmap_mutex);
    4848             :         }
    4849             : 
    4850           0 :         if (is_cgroup_event(event))
    4851           0 :                 perf_detach_cgroup(event);
    4852             : 
    4853           0 :         if (!event->parent) {
    4854           0 :                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
    4855           0 :                         put_callchain_buffers();
    4856             :         }
    4857             : 
    4858           0 :         perf_event_free_bpf_prog(event);
    4859           0 :         perf_addr_filters_splice(event, NULL);
    4860           0 :         kfree(event->addr_filter_ranges);
    4861             : 
    4862           0 :         if (event->destroy)
    4863           0 :                 event->destroy(event);
    4864             : 
    4865             :         /*
    4866             :          * Must be after ->destroy(), due to uprobe_perf_close() using
    4867             :          * hw.target.
    4868             :          */
    4869           0 :         if (event->hw.target)
    4870           0 :                 put_task_struct(event->hw.target);
    4871             : 
    4872             :         /*
    4873             :          * perf_event_free_task() relies on put_ctx() being 'last', in particular
    4874             :          * all task references must be cleaned up.
    4875             :          */
    4876           0 :         if (event->ctx)
    4877           0 :                 put_ctx(event->ctx);
    4878             : 
    4879           0 :         exclusive_event_destroy(event);
    4880           0 :         module_put(event->pmu->module);
    4881             : 
    4882           0 :         call_rcu(&event->rcu_head, free_event_rcu);
    4883           0 : }
    4884             : 
    4885             : /*
    4886             :  * Used to free events which have a known refcount of 1, such as in error paths
    4887             :  * where the event isn't exposed yet, and to free inherited events.
    4888             :  */
    4889           0 : static void free_event(struct perf_event *event)
    4890             : {
    4891           0 :         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
    4892             :                                 "unexpected event refcount: %ld; ptr=%p\n",
    4893             :                                 atomic_long_read(&event->refcount), event)) {
    4894             :                 /* leak to avoid use-after-free */
    4895             :                 return;
    4896             :         }
    4897             : 
    4898           0 :         _free_event(event);
    4899             : }
    4900             : 
    4901             : /*
    4902             :  * Remove user event from the owner task.
    4903             :  */
    4904           0 : static void perf_remove_from_owner(struct perf_event *event)
    4905             : {
    4906           0 :         struct task_struct *owner;
    4907             : 
    4908           0 :         rcu_read_lock();
    4909             :         /*
    4910             :          * Matches the smp_store_release() in perf_event_exit_task(). If we
    4911             :          * observe !owner it means the list deletion is complete and we can
    4912             :          * indeed free this event; otherwise we need to serialize on
    4913             :          * owner->perf_event_mutex.
    4914             :          */
    4915           0 :         owner = READ_ONCE(event->owner);
    4916           0 :         if (owner) {
    4917             :                 /*
    4918             :                  * Since delayed_put_task_struct() also drops the last
    4919             :                  * task reference we can safely take a new reference
    4920             :                  * while holding the rcu_read_lock().
    4921             :                  */
    4922           0 :                 get_task_struct(owner);
    4923             :         }
    4924           0 :         rcu_read_unlock();
    4925             : 
    4926           0 :         if (owner) {
    4927             :                 /*
    4928             :                  * If we're here through perf_event_exit_task() we're already
    4929             :                  * holding ctx->mutex which would be an inversion wrt. the
    4930             :                  * normal lock order.
    4931             :                  *
    4932             :                  * However, we can safely take this lock because it's the child
    4933             :                  * ctx->mutex.
    4934             :                  */
    4935           0 :                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
    4936             : 
    4937             :                 /*
    4938             :                  * We have to re-check the event->owner field; if it is cleared,
    4939             :                  * we raced with perf_event_exit_task(). Acquiring the mutex
    4940             :                  * ensured they're done, and we can proceed with freeing the
    4941             :                  * event.
    4942             :                  */
    4943           0 :                 if (event->owner) {
    4944           0 :                         list_del_init(&event->owner_entry);
    4945           0 :                         smp_store_release(&event->owner, NULL);
    4946             :                 }
    4947           0 :                 mutex_unlock(&owner->perf_event_mutex);
    4948           0 :                 put_task_struct(owner);
    4949             :         }
    4950           0 : }
    4951             : 
    4952           0 : static void put_event(struct perf_event *event)
    4953             : {
    4954           0 :         if (!atomic_long_dec_and_test(&event->refcount))
    4955             :                 return;
    4956             : 
    4957           0 :         _free_event(event);
    4958             : }
    4959             : 
    4960             : /*
    4961             :  * Kill an event dead; while event:refcount will preserve the event
    4962             :  * object, it will not preserve its functionality. Once the last 'user'
    4963             :  * gives up the object, we'll destroy the thing.
    4964             :  */
    4965           0 : int perf_event_release_kernel(struct perf_event *event)
    4966             : {
    4967           0 :         struct perf_event_context *ctx = event->ctx;
    4968           0 :         struct perf_event *child, *tmp;
    4969           0 :         LIST_HEAD(free_list);
    4970             : 
    4971             :         /*
    4972             :          * If we got here through err_file: fput(event_file); we will not have
    4973             :          * attached to a context yet.
    4974             :          */
    4975           0 :         if (!ctx) {
    4976           0 :                 WARN_ON_ONCE(event->attach_state &
    4977             :                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
    4978           0 :                 goto no_ctx;
    4979             :         }
    4980             : 
    4981           0 :         if (!is_kernel_event(event))
    4982           0 :                 perf_remove_from_owner(event);
    4983             : 
    4984           0 :         ctx = perf_event_ctx_lock(event);
    4985           0 :         WARN_ON_ONCE(ctx->parent_ctx);
    4986           0 :         perf_remove_from_context(event, DETACH_GROUP);
    4987             : 
    4988           0 :         raw_spin_lock_irq(&ctx->lock);
    4989             :         /*
    4990             :          * Mark this event as STATE_DEAD; there is no external reference to it
    4991             :          * anymore.
    4992             :          *
    4993             :          * Anybody acquiring event->child_mutex after the below loop _must_
    4994             :          * also see this, most importantly inherit_event() which will avoid
    4995             :          * placing more children on the list.
    4996             :          *
    4997             :          * Thus this guarantees that we will in fact observe and kill _ALL_
    4998             :          * child events.
    4999             :          */
    5000           0 :         event->state = PERF_EVENT_STATE_DEAD;
    5001           0 :         raw_spin_unlock_irq(&ctx->lock);
    5002             : 
    5003           0 :         perf_event_ctx_unlock(event, ctx);
    5004             : 
    5005           0 : again:
    5006           0 :         mutex_lock(&event->child_mutex);
    5007           0 :         list_for_each_entry(child, &event->child_list, child_list) {
    5008             : 
    5009             :                 /*
    5010             :                  * Cannot change; child events are not migrated, see the
    5011             :                  * comment with perf_event_ctx_lock_nested().
    5012             :                  */
    5013           0 :                 ctx = READ_ONCE(child->ctx);
    5014             :                 /*
    5015             :                  * Since child_mutex nests inside ctx::mutex, we must jump
    5016             :                  * through hoops. We start by grabbing a reference on the ctx.
    5017             :                  *
    5018             :                  * Since the event cannot get freed while we hold the
    5019             :                  * child_mutex, the context must also exist and have a !0
    5020             :                  * reference count.
    5021             :                  */
    5022           0 :                 get_ctx(ctx);
    5023             : 
    5024             :                 /*
    5025             :                  * Now that we have a ctx ref, we can drop child_mutex, and
    5026             :                  * acquire ctx::mutex without fear of it going away. Then we
    5027             :                  * can re-acquire child_mutex.
    5028             :                  */
    5029           0 :                 mutex_unlock(&event->child_mutex);
    5030           0 :                 mutex_lock(&ctx->mutex);
    5031           0 :                 mutex_lock(&event->child_mutex);
    5032             : 
    5033             :                 /*
    5034             :                  * Now that we hold ctx::mutex and child_mutex, revalidate our
    5035             :                  * state; if child is still the first entry, it didn't get freed
    5036             :                  * and we can continue doing so.
    5037             :                  */
    5038           0 :                 tmp = list_first_entry_or_null(&event->child_list,
    5039             :                                                struct perf_event, child_list);
    5040           0 :                 if (tmp == child) {
    5041           0 :                         perf_remove_from_context(child, DETACH_GROUP);
    5042           0 :                         list_move(&child->child_list, &free_list);
    5043             :                         /*
    5044             :                          * This matches the refcount bump in inherit_event();
    5045             :                          * this can't be the last reference.
    5046             :                          */
    5047           0 :                         put_event(event);
    5048             :                 }
    5049             : 
    5050           0 :                 mutex_unlock(&event->child_mutex);
    5051           0 :                 mutex_unlock(&ctx->mutex);
    5052           0 :                 put_ctx(ctx);
    5053           0 :                 goto again;
    5054             :         }
    5055           0 :         mutex_unlock(&event->child_mutex);
    5056             : 
    5057           0 :         list_for_each_entry_safe(child, tmp, &free_list, child_list) {
    5058           0 :                 void *var = &child->ctx->refcount;
    5059             : 
    5060           0 :                 list_del(&child->child_list);
    5061           0 :                 free_event(child);
    5062             : 
    5063             :                 /*
    5064             :                  * Wake any perf_event_free_task() waiting for this event to be
    5065             :                  * freed.
    5066             :                  */
    5067           0 :                 smp_mb(); /* pairs with wait_var_event() */
    5068           0 :                 wake_up_var(var);
    5069             :         }
    5070             : 
    5071           0 : no_ctx:
    5072           0 :         put_event(event); /* Must be the 'last' reference */
    5073           0 :         return 0;
    5074             : }
    5075             : EXPORT_SYMBOL_GPL(perf_event_release_kernel);
    5076             : 
    5077             : /*
    5078             :  * Called when the last reference to the file is gone.
    5079             :  */
    5080           0 : static int perf_release(struct inode *inode, struct file *file)
    5081             : {
    5082           0 :         perf_event_release_kernel(file->private_data);
    5083           0 :         return 0;
    5084             : }
    5085             : 
    5086           0 : static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
    5087             : {
    5088           0 :         struct perf_event *child;
    5089           0 :         u64 total = 0;
    5090             : 
    5091           0 :         *enabled = 0;
    5092           0 :         *running = 0;
    5093             : 
    5094           0 :         mutex_lock(&event->child_mutex);
    5095             : 
    5096           0 :         (void)perf_event_read(event, false);
    5097           0 :         total += perf_event_count(event);
    5098             : 
    5099           0 :         *enabled += event->total_time_enabled +
    5100           0 :                         atomic64_read(&event->child_total_time_enabled);
    5101           0 :         *running += event->total_time_running +
    5102           0 :                         atomic64_read(&event->child_total_time_running);
    5103             : 
    5104           0 :         list_for_each_entry(child, &event->child_list, child_list) {
    5105           0 :                 (void)perf_event_read(child, false);
    5106           0 :                 total += perf_event_count(child);
    5107           0 :                 *enabled += child->total_time_enabled;
    5108           0 :                 *running += child->total_time_running;
    5109             :         }
    5110           0 :         mutex_unlock(&event->child_mutex);
    5111             : 
    5112           0 :         return total;
    5113             : }
    5114             : 
    5115           0 : u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
    5116             : {
    5117           0 :         struct perf_event_context *ctx;
    5118           0 :         u64 count;
    5119             : 
    5120           0 :         ctx = perf_event_ctx_lock(event);
    5121           0 :         count = __perf_event_read_value(event, enabled, running);
    5122           0 :         perf_event_ctx_unlock(event, ctx);
    5123             : 
    5124           0 :         return count;
    5125             : }
    5126             : EXPORT_SYMBOL_GPL(perf_event_read_value);
    5127             : 
    5128           0 : static int __perf_read_group_add(struct perf_event *leader,
    5129             :                                         u64 read_format, u64 *values)
    5130             : {
    5131           0 :         struct perf_event_context *ctx = leader->ctx;
    5132           0 :         struct perf_event *sub;
    5133           0 :         unsigned long flags;
    5134           0 :         int n = 1; /* skip @nr */
    5135           0 :         int ret;
    5136             : 
    5137           0 :         ret = perf_event_read(leader, true);
    5138           0 :         if (ret)
    5139             :                 return ret;
    5140             : 
    5141           0 :         raw_spin_lock_irqsave(&ctx->lock, flags);
    5142             : 
    5143             :         /*
    5144             :          * Since we co-schedule groups, {enabled,running} times of siblings
    5145             :          * will be identical to those of the leader, so we only publish one
    5146             :          * set.
    5147             :          */
    5148           0 :         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
    5149           0 :                 values[n++] += leader->total_time_enabled +
    5150           0 :                         atomic64_read(&leader->child_total_time_enabled);
    5151             :         }
    5152             : 
    5153           0 :         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
    5154           0 :                 values[n++] += leader->total_time_running +
    5155           0 :                         atomic64_read(&leader->child_total_time_running);
    5156             :         }
    5157             : 
    5158             :         /*
    5159             :          * Write {count,id} tuples for every sibling.
    5160             :          */
    5161           0 :         values[n++] += perf_event_count(leader);
    5162           0 :         if (read_format & PERF_FORMAT_ID)
    5163           0 :                 values[n++] = primary_event_id(leader);
    5164             : 
    5165           0 :         for_each_sibling_event(sub, leader) {
    5166           0 :                 values[n++] += perf_event_count(sub);
    5167           0 :                 if (read_format & PERF_FORMAT_ID)
    5168           0 :                         values[n++] = primary_event_id(sub);
    5169             :         }
    5170             : 
    5171           0 :         raw_spin_unlock_irqrestore(&ctx->lock, flags);
    5172           0 :         return 0;
    5173             : }
    5174             : 
    5175           0 : static int perf_read_group(struct perf_event *event,
    5176             :                                    u64 read_format, char __user *buf)
    5177             : {
    5178           0 :         struct perf_event *leader = event->group_leader, *child;
    5179           0 :         struct perf_event_context *ctx = leader->ctx;
    5180           0 :         int ret;
    5181           0 :         u64 *values;
    5182             : 
    5183           0 :         lockdep_assert_held(&ctx->mutex);
    5184             : 
    5185           0 :         values = kzalloc(event->read_size, GFP_KERNEL);
    5186           0 :         if (!values)
    5187             :                 return -ENOMEM;
    5188             : 
    5189           0 :         values[0] = 1 + leader->nr_siblings;
    5190             : 
    5191             :         /*
    5192             :          * By locking the child_mutex of the leader we effectively
    5193             :          * lock the child list of all siblings.  XXX: explain how.
    5194             :          */
    5195           0 :         mutex_lock(&leader->child_mutex);
    5196             : 
    5197           0 :         ret = __perf_read_group_add(leader, read_format, values);
    5198           0 :         if (ret)
    5199           0 :                 goto unlock;
    5200             : 
    5201           0 :         list_for_each_entry(child, &leader->child_list, child_list) {
    5202           0 :                 ret = __perf_read_group_add(child, read_format, values);
    5203           0 :                 if (ret)
    5204           0 :                         goto unlock;
    5205             :         }
    5206             : 
    5207           0 :         mutex_unlock(&leader->child_mutex);
    5208             : 
    5209           0 :         ret = event->read_size;
    5210           0 :         if (copy_to_user(buf, values, event->read_size))
    5211           0 :                 ret = -EFAULT;
    5212           0 :         goto out;
    5213             : 
    5214           0 : unlock:
    5215           0 :         mutex_unlock(&leader->child_mutex);
    5216           0 : out:
    5217           0 :         kfree(values);
    5218           0 :         return ret;
    5219             : }
    5220             : 
    5221           0 : static int perf_read_one(struct perf_event *event,
    5222             :                                  u64 read_format, char __user *buf)
    5223             : {
    5224           0 :         u64 enabled, running;
    5225           0 :         u64 values[4];
    5226           0 :         int n = 0;
    5227             : 
    5228           0 :         values[n++] = __perf_event_read_value(event, &enabled, &running);
    5229           0 :         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
    5230           0 :                 values[n++] = enabled;
    5231           0 :         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
    5232           0 :                 values[n++] = running;
    5233           0 :         if (read_format & PERF_FORMAT_ID)
    5234           0 :                 values[n++] = primary_event_id(event);
    5235             : 
    5236           0 :         if (copy_to_user(buf, values, n * sizeof(u64)))
    5237             :                 return -EFAULT;
    5238             : 
    5239           0 :         return n * sizeof(u64);
    5240             : }
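
For a non-group event the same fixed ordering applies: value, then the optional enabled time, running time and id. The enabled/running pair is what userspace uses to scale a multiplexed counter; a hedged sketch, assuming all three optional read_format bits were requested when the event was opened:

    #include <unistd.h>
    #include <linux/types.h>

    struct single_read {
            __u64 value;
            __u64 time_enabled;     /* PERF_FORMAT_TOTAL_TIME_ENABLED */
            __u64 time_running;     /* PERF_FORMAT_TOTAL_TIME_RUNNING */
            __u64 id;               /* PERF_FORMAT_ID */
    };

    /* Estimate the full-speed count when the event was multiplexed. */
    static __u64 read_scaled(int perf_fd)
    {
            struct single_read r;

            if (read(perf_fd, &r, sizeof(r)) != sizeof(r) || !r.time_running)
                    return 0;
            return (__u64)((double)r.value * r.time_enabled / r.time_running);
    }
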
    5241             : 
    5242           0 : static bool is_event_hup(struct perf_event *event)
    5243             : {
    5244           0 :         bool no_children;
    5245             : 
    5246           0 :         if (event->state > PERF_EVENT_STATE_EXIT)
    5247             :                 return false;
    5248             : 
    5249           0 :         mutex_lock(&event->child_mutex);
    5250           0 :         no_children = list_empty(&event->child_list);
    5251           0 :         mutex_unlock(&event->child_mutex);
    5252           0 :         return no_children;
    5253             : }
    5254             : 
    5255             : /*
    5256             :  * Read the performance event - simple non-blocking version for now
    5257             :  */
    5258             : static ssize_t
    5259           0 : __perf_read(struct perf_event *event, char __user *buf, size_t count)
    5260             : {
    5261           0 :         u64 read_format = event->attr.read_format;
    5262           0 :         int ret;
    5263             : 
    5264             :         /*
    5265             :          * Return end-of-file for a read on an event that is in
    5266             :          * error state (i.e. because it was pinned but it couldn't be
    5267             :          * scheduled onto the CPU at some point).
    5268             :          */
    5269           0 :         if (event->state == PERF_EVENT_STATE_ERROR)
    5270             :                 return 0;
    5271             : 
    5272           0 :         if (count < event->read_size)
    5273             :                 return -ENOSPC;
    5274             : 
    5275           0 :         WARN_ON_ONCE(event->ctx->parent_ctx);
    5276           0 :         if (read_format & PERF_FORMAT_GROUP)
    5277           0 :                 ret = perf_read_group(event, read_format, buf);
    5278             :         else
    5279           0 :                 ret = perf_read_one(event, read_format, buf);
    5280             : 
    5281           0 :         return ret;
    5282             : }
    5283             : 
    5284             : static ssize_t
    5285           0 : perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
    5286             : {
    5287           0 :         struct perf_event *event = file->private_data;
    5288           0 :         struct perf_event_context *ctx;
    5289           0 :         int ret;
    5290             : 
    5291           0 :         ret = security_perf_event_read(event);
    5292           0 :         if (ret)
    5293           0 :                 return ret;
    5294             : 
    5295           0 :         ctx = perf_event_ctx_lock(event);
    5296           0 :         ret = __perf_read(event, buf, count);
    5297           0 :         perf_event_ctx_unlock(event, ctx);
    5298             : 
    5299           0 :         return ret;
    5300             : }
    5301             : 
    5302           0 : static __poll_t perf_poll(struct file *file, poll_table *wait)
    5303             : {
    5304           0 :         struct perf_event *event = file->private_data;
    5305           0 :         struct perf_buffer *rb;
    5306           0 :         __poll_t events = EPOLLHUP;
    5307             : 
    5308           0 :         poll_wait(file, &event->waitq, wait);
    5309             : 
    5310           0 :         if (is_event_hup(event))
    5311             :                 return events;
    5312             : 
    5313             :         /*
    5314             :          * Pin the event->rb by taking event->mmap_mutex; otherwise
    5315             :          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
    5316             :          */
    5317           0 :         mutex_lock(&event->mmap_mutex);
    5318           0 :         rb = event->rb;
    5319           0 :         if (rb)
    5320           0 :                 events = atomic_xchg(&rb->poll, 0);
    5321           0 :         mutex_unlock(&event->mmap_mutex);
    5322           0 :         return events;
    5323             : }
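
perf_poll() only reports wakeups accumulated in rb->poll, so userspace typically parks in poll() until the kernel signals fresh data (or EPOLLHUP once the event has exited and has no children left). A minimal sketch, with the fd and timeout as assumptions:

    #include <poll.h>

    /* Wait for new data in the mmap'ed ring buffer.
     * Returns 1 on data, 0 on hang-up or timeout, -1 on error. */
    static int wait_for_samples(int perf_fd, int timeout_ms)
    {
            struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };
            int ret = poll(&pfd, 1, timeout_ms);

            if (ret < 0)
                    return -1;
            if (ret == 0 || (pfd.revents & POLLHUP))
                    return 0;
            return 1;
    }
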
    5324             : 
    5325           0 : static void _perf_event_reset(struct perf_event *event)
    5326             : {
    5327           0 :         (void)perf_event_read(event, false);
    5328           0 :         local64_set(&event->count, 0);
    5329           0 :         perf_event_update_userpage(event);
    5330           0 : }
    5331             : 
    5332             : /* Assume it's not an event with inherit set. */
    5333           0 : u64 perf_event_pause(struct perf_event *event, bool reset)
    5334             : {
    5335           0 :         struct perf_event_context *ctx;
    5336           0 :         u64 count;
    5337             : 
    5338           0 :         ctx = perf_event_ctx_lock(event);
    5339           0 :         WARN_ON_ONCE(event->attr.inherit);
    5340           0 :         _perf_event_disable(event);
    5341           0 :         count = local64_read(&event->count);
    5342           0 :         if (reset)
    5343           0 :                 local64_set(&event->count, 0);
    5344           0 :         perf_event_ctx_unlock(event, ctx);
    5345             : 
    5346           0 :         return count;
    5347             : }
    5348             : EXPORT_SYMBOL_GPL(perf_event_pause);
    5349             : 
    5350             : /*
    5351             :  * Holding the top-level event's child_mutex means that any
    5352             :  * descendant process that has inherited this event will block
    5353             :  * in perf_event_exit_event() if it goes to exit, thus satisfying the
    5354             :  * task existence requirements of perf_event_enable/disable.
    5355             :  */
    5356           0 : static void perf_event_for_each_child(struct perf_event *event,
    5357             :                                         void (*func)(struct perf_event *))
    5358             : {
    5359           0 :         struct perf_event *child;
    5360             : 
    5361           0 :         WARN_ON_ONCE(event->ctx->parent_ctx);
    5362             : 
    5363           0 :         mutex_lock(&event->child_mutex);
    5364           0 :         func(event);
    5365           0 :         list_for_each_entry(child, &event->child_list, child_list)
    5366           0 :                 func(child);
    5367           0 :         mutex_unlock(&event->child_mutex);
    5368           0 : }
    5369             : 
    5370           0 : static void perf_event_for_each(struct perf_event *event,
    5371             :                                   void (*func)(struct perf_event *))
    5372             : {
    5373           0 :         struct perf_event_context *ctx = event->ctx;
    5374           0 :         struct perf_event *sibling;
    5375             : 
    5376           0 :         lockdep_assert_held(&ctx->mutex);
    5377             : 
    5378           0 :         event = event->group_leader;
    5379             : 
    5380           0 :         perf_event_for_each_child(event, func);
    5381           0 :         for_each_sibling_event(sibling, event)
    5382           0 :                 perf_event_for_each_child(sibling, func);
    5383           0 : }
    5384             : 
    5385           0 : static void __perf_event_period(struct perf_event *event,
    5386             :                                 struct perf_cpu_context *cpuctx,
    5387             :                                 struct perf_event_context *ctx,
    5388             :                                 void *info)
    5389             : {
    5390           0 :         u64 value = *((u64 *)info);
    5391           0 :         bool active;
    5392             : 
    5393           0 :         if (event->attr.freq) {
    5394           0 :                 event->attr.sample_freq = value;
    5395             :         } else {
    5396           0 :                 event->attr.sample_period = value;
    5397           0 :                 event->hw.sample_period = value;
    5398             :         }
    5399             : 
    5400           0 :         active = (event->state == PERF_EVENT_STATE_ACTIVE);
    5401           0 :         if (active) {
    5402           0 :                 perf_pmu_disable(ctx->pmu);
    5403             :                 /*
    5404             :                  * We could be throttled; unthrottle now to avoid the tick
    5405             :                  * trying to unthrottle while we already re-started the event.
    5406             :                  */
    5407           0 :                 if (event->hw.interrupts == MAX_INTERRUPTS) {
    5408           0 :                         event->hw.interrupts = 0;
    5409           0 :                         perf_log_throttle(event, 1);
    5410             :                 }
    5411           0 :                 event->pmu->stop(event, PERF_EF_UPDATE);
    5412             :         }
    5413             : 
    5414           0 :         local64_set(&event->hw.period_left, 0);
    5415             : 
    5416           0 :         if (active) {
    5417           0 :                 event->pmu->start(event, PERF_EF_RELOAD);
    5418           0 :                 perf_pmu_enable(ctx->pmu);
    5419             :         }
    5420           0 : }
    5421             : 
    5422           0 : static int perf_event_check_period(struct perf_event *event, u64 value)
    5423             : {
    5424           0 :         return event->pmu->check_period(event, value);
    5425             : }
    5426             : 
    5427           0 : static int _perf_event_period(struct perf_event *event, u64 value)
    5428             : {
    5429           0 :         if (!is_sampling_event(event))
    5430             :                 return -EINVAL;
    5431             : 
    5432           0 :         if (!value)
    5433             :                 return -EINVAL;
    5434             : 
    5435           0 :         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
    5436             :                 return -EINVAL;
    5437             : 
    5438           0 :         if (perf_event_check_period(event, value))
    5439             :                 return -EINVAL;
    5440             : 
    5441           0 :         if (!event->attr.freq && (value & (1ULL << 63)))
    5442             :                 return -EINVAL;
    5443             : 
    5444           0 :         event_function_call(event, __perf_event_period, &value);
    5445             : 
    5446           0 :         return 0;
    5447             : }
    5448             : 
    5449           0 : int perf_event_period(struct perf_event *event, u64 value)
    5450             : {
    5451           0 :         struct perf_event_context *ctx;
    5452           0 :         int ret;
    5453             : 
    5454           0 :         ctx = perf_event_ctx_lock(event);
    5455           0 :         ret = _perf_event_period(event, value);
    5456           0 :         perf_event_ctx_unlock(event, ctx);
    5457             : 
    5458           0 :         return ret;
    5459             : }
    5460             : EXPORT_SYMBOL_GPL(perf_event_period);
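
The same update is reachable from userspace via the PERF_EVENT_IOC_PERIOD ioctl handled further down, which takes a pointer to the new u64 period (or frequency, if the event was opened with attr.freq set). A short sketch; the fd is assumed to be an already-open sampling event:

    #include <sys/ioctl.h>
    #include <linux/perf_event.h>

    /* Update the sampling period/frequency of a live event. */
    static int set_sample_period(int perf_fd, __u64 new_period)
    {
            return ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &new_period);
    }
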
    5461             : 
    5462             : static const struct file_operations perf_fops;
    5463             : 
    5464           0 : static inline int perf_fget_light(int fd, struct fd *p)
    5465             : {
    5466           0 :         struct fd f = fdget(fd);
    5467           0 :         if (!f.file)
    5468             :                 return -EBADF;
    5469             : 
    5470           0 :         if (f.file->f_op != &perf_fops) {
    5471           0 :                 fdput(f);
    5472           0 :                 return -EBADF;
    5473             :         }
    5474           0 :         *p = f;
    5475           0 :         return 0;
    5476             : }
    5477             : 
    5478             : static int perf_event_set_output(struct perf_event *event,
    5479             :                                  struct perf_event *output_event);
    5480             : static int perf_event_set_filter(struct perf_event *event, void __user *arg);
    5481             : static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
    5482             : static int perf_copy_attr(struct perf_event_attr __user *uattr,
    5483             :                           struct perf_event_attr *attr);
    5484             : 
    5485           0 : static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
    5486             : {
    5487           0 :         void (*func)(struct perf_event *);
    5488           0 :         u32 flags = arg;
    5489             : 
    5490           0 :         switch (cmd) {
    5491             :         case PERF_EVENT_IOC_ENABLE:
    5492             :                 func = _perf_event_enable;
    5493             :                 break;
    5494           0 :         case PERF_EVENT_IOC_DISABLE:
    5495           0 :                 func = _perf_event_disable;
    5496           0 :                 break;
    5497           0 :         case PERF_EVENT_IOC_RESET:
    5498           0 :                 func = _perf_event_reset;
    5499           0 :                 break;
    5500             : 
    5501           0 :         case PERF_EVENT_IOC_REFRESH:
    5502           0 :                 return _perf_event_refresh(event, arg);
    5503             : 
    5504           0 :         case PERF_EVENT_IOC_PERIOD:
    5505             :         {
    5506           0 :                 u64 value;
    5507             : 
    5508           0 :                 if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
    5509             :                         return -EFAULT;
    5510             : 
    5511           0 :                 return _perf_event_period(event, value);
    5512             :         }
    5513             :         case PERF_EVENT_IOC_ID:
    5514             :         {
    5515           0 :                 u64 id = primary_event_id(event);
    5516             : 
    5517           0 :                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
    5518           0 :                         return -EFAULT;
    5519             :                 return 0;
    5520             :         }
    5521             : 
    5522           0 :         case PERF_EVENT_IOC_SET_OUTPUT:
    5523             :         {
    5524           0 :                 int ret;
    5525           0 :                 if (arg != -1) {
    5526           0 :                         struct perf_event *output_event;
    5527           0 :                         struct fd output;
    5528           0 :                         ret = perf_fget_light(arg, &output);
    5529           0 :                         if (ret)
    5530           0 :                                 return ret;
    5531           0 :                         output_event = output.file->private_data;
    5532           0 :                         ret = perf_event_set_output(event, output_event);
    5533           0 :                         fdput(output);
    5534             :                 } else {
    5535           0 :                         ret = perf_event_set_output(event, NULL);
    5536             :                 }
    5537           0 :                 return ret;
    5538             :         }
    5539             : 
    5540           0 :         case PERF_EVENT_IOC_SET_FILTER:
    5541           0 :                 return perf_event_set_filter(event, (void __user *)arg);
    5542             : 
    5543           0 :         case PERF_EVENT_IOC_SET_BPF:
    5544           0 :                 return perf_event_set_bpf_prog(event, arg);
    5545             : 
    5546             :         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
    5547           0 :                 struct perf_buffer *rb;
    5548             : 
    5549           0 :                 rcu_read_lock();
    5550           0 :                 rb = rcu_dereference(event->rb);
    5551           0 :                 if (!rb || !rb->nr_pages) {
    5552           0 :                         rcu_read_unlock();
    5553           0 :                         return -EINVAL;
    5554             :                 }
    5555           0 :                 rb_toggle_paused(rb, !!arg);
    5556           0 :                 rcu_read_unlock();
    5557           0 :                 return 0;
    5558             :         }
    5559             : 
    5560           0 :         case PERF_EVENT_IOC_QUERY_BPF:
    5561           0 :                 return perf_event_query_prog_array(event, (void __user *)arg);
    5562             : 
    5563           0 :         case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
    5564           0 :                 struct perf_event_attr new_attr;
    5565           0 :                 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
    5566             :                                          &new_attr);
    5567             : 
    5568           0 :                 if (err)
    5569           0 :                         return err;
    5570             : 
    5571           0 :                 return perf_event_modify_attr(event,  &new_attr);
    5572             :         }
    5573             :         default:
    5574             :                 return -ENOTTY;
    5575             :         }
    5576             : 
    5577           0 :         if (flags & PERF_IOC_FLAG_GROUP)
    5578           0 :                 perf_event_for_each(event, func);
    5579             :         else
    5580           0 :                 perf_event_for_each_child(event, func);
    5581             : 
    5582             :         return 0;
    5583             : }
    5584             : 
    5585           0 : static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
    5586             : {
    5587           0 :         struct perf_event *event = file->private_data;
    5588           0 :         struct perf_event_context *ctx;
    5589           0 :         long ret;
    5590             : 
    5591             :         /* Treat ioctl like writes as it is likely a mutating operation. */
    5592           0 :         ret = security_perf_event_write(event);
    5593           0 :         if (ret)
    5594             :                 return ret;
    5595             : 
    5596           0 :         ctx = perf_event_ctx_lock(event);
    5597           0 :         ret = _perf_ioctl(event, cmd, arg);
    5598           0 :         perf_event_ctx_unlock(event, ctx);
    5599             : 
    5600           0 :         return ret;
    5601             : }
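
From userspace these are ordinary ioctls on the event fd; passing PERF_IOC_FLAG_GROUP as the argument makes enable/disable/reset walk the whole group via perf_event_for_each(), exactly as the tail of _perf_ioctl() shows. A hedged sketch, assuming leader_fd refers to a group leader:

    #include <sys/ioctl.h>
    #include <linux/perf_event.h>

    /* Zero and (re)start every counter in the group at once. */
    static int restart_group(int leader_fd)
    {
            if (ioctl(leader_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP))
                    return -1;
            return ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    }
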
    5602             : 
    5603             : #ifdef CONFIG_COMPAT
    5604           0 : static long perf_compat_ioctl(struct file *file, unsigned int cmd,
    5605             :                                 unsigned long arg)
    5606             : {
    5607           0 :         switch (_IOC_NR(cmd)) {
    5608           0 :         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
    5609             :         case _IOC_NR(PERF_EVENT_IOC_ID):
    5610             :         case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
    5611             :         case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
    5612             :                 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
    5613           0 :                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
    5614           0 :                         cmd &= ~IOCSIZE_MASK;
    5615           0 :                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
    5616             :                 }
    5617             :                 break;
    5618             :         }
    5619           0 :         return perf_ioctl(file, cmd, arg);
    5620             : }
    5621             : #else
    5622             : # define perf_compat_ioctl NULL
    5623             : #endif
    5624             : 
    5625           0 : int perf_event_task_enable(void)
    5626             : {
    5627           0 :         struct perf_event_context *ctx;
    5628           0 :         struct perf_event *event;
    5629             : 
    5630           0 :         mutex_lock(&current->perf_event_mutex);
    5631           0 :         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
    5632           0 :                 ctx = perf_event_ctx_lock(event);
    5633           0 :                 perf_event_for_each_child(event, _perf_event_enable);
    5634           0 :                 perf_event_ctx_unlock(event, ctx);
    5635             :         }
    5636           0 :         mutex_unlock(&current->perf_event_mutex);
    5637             : 
    5638           0 :         return 0;
    5639             : }
    5640             : 
    5641           0 : int perf_event_task_disable(void)
    5642             : {
    5643           0 :         struct perf_event_context *ctx;
    5644           0 :         struct perf_event *event;
    5645             : 
    5646           0 :         mutex_lock(&current->perf_event_mutex);
    5647           0 :         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
    5648           0 :                 ctx = perf_event_ctx_lock(event);
    5649           0 :                 perf_event_for_each_child(event, _perf_event_disable);
    5650           0 :                 perf_event_ctx_unlock(event, ctx);
    5651             :         }
    5652           0 :         mutex_unlock(&current->perf_event_mutex);
    5653             : 
    5654           0 :         return 0;
    5655             : }
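
perf_event_task_enable() and perf_event_task_disable() are the backends of the PR_TASK_PERF_EVENTS_ENABLE / PR_TASK_PERF_EVENTS_DISABLE prctls, which toggle every event the calling task owns. A minimal sketch of driving them:

    #include <sys/prctl.h>
    #include <linux/prctl.h>

    /* Pause, then later resume, all perf events owned by this task. */
    static void pause_own_events(void)
    {
            prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0);
    }

    static void resume_own_events(void)
    {
            prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0);
    }
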
    5656             : 
    5657           0 : static int perf_event_index(struct perf_event *event)
    5658             : {
    5659           0 :         if (event->hw.state & PERF_HES_STOPPED)
    5660             :                 return 0;
    5661             : 
    5662           0 :         if (event->state != PERF_EVENT_STATE_ACTIVE)
    5663             :                 return 0;
    5664             : 
    5665           0 :         return event->pmu->event_idx(event);
    5666             : }
    5667             : 
    5668           0 : static void calc_timer_values(struct perf_event *event,
    5669             :                                 u64 *now,
    5670             :                                 u64 *enabled,
    5671             :                                 u64 *running)
    5672             : {
    5673           0 :         u64 ctx_time;
    5674             : 
    5675           0 :         *now = perf_clock();
    5676           0 :         ctx_time = event->shadow_ctx_time + *now;
    5677           0 :         __perf_update_times(event, ctx_time, enabled, running);
    5678           0 : }
    5679             : 
    5680           0 : static void perf_event_init_userpage(struct perf_event *event)
    5681             : {
    5682           0 :         struct perf_event_mmap_page *userpg;
    5683           0 :         struct perf_buffer *rb;
    5684             : 
    5685           0 :         rcu_read_lock();
    5686           0 :         rb = rcu_dereference(event->rb);
    5687           0 :         if (!rb)
    5688           0 :                 goto unlock;
    5689             : 
    5690           0 :         userpg = rb->user_page;
    5691             : 
    5692             :         /* Allow new userspace to detect that bit 0 is deprecated */
    5693           0 :         userpg->cap_bit0_is_deprecated = 1;
    5694           0 :         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
    5695           0 :         userpg->data_offset = PAGE_SIZE;
    5696           0 :         userpg->data_size = perf_data_size(rb);
    5697             : 
    5698           0 : unlock:
    5699           0 :         rcu_read_unlock();
    5700           0 : }
    5701             : 
    5702           0 : void __weak arch_perf_update_userpage(
    5703             :         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
    5704             : {
    5705           0 : }
    5706             : 
    5707             : /*
    5708             :  * Callers need to ensure there can be no nesting of this function, otherwise
    5709             :  * the seqlock logic goes bad. We cannot serialize this because the arch
    5710             :  * code calls this from NMI context.
    5711             :  */
    5712           0 : void perf_event_update_userpage(struct perf_event *event)
    5713             : {
    5714           0 :         struct perf_event_mmap_page *userpg;
    5715           0 :         struct perf_buffer *rb;
    5716           0 :         u64 enabled, running, now;
    5717             : 
    5718           0 :         rcu_read_lock();
    5719           0 :         rb = rcu_dereference(event->rb);
    5720           0 :         if (!rb)
    5721           0 :                 goto unlock;
    5722             : 
    5723             :         /*
    5724             :          * compute total_time_enabled, total_time_running
    5725             :          * based on snapshot values taken when the event
    5726             :          * was last scheduled in.
    5727             :          *
    5728             :          * we cannot simply call update_context_time()
    5729             :          * because of a locking issue, as we can be called in
    5730             :          * NMI context
    5731             :          */
    5732           0 :         calc_timer_values(event, &now, &enabled, &running);
    5733             : 
    5734           0 :         userpg = rb->user_page;
    5735             :         /*
    5736             :          * Disable preemption to guarantee consistent time stamps are stored to
    5737             :          * the user page.
    5738             :          */
    5739           0 :         preempt_disable();
    5740           0 :         ++userpg->lock;
    5741           0 :         barrier();
    5742           0 :         userpg->index = perf_event_index(event);
    5743           0 :         userpg->offset = perf_event_count(event);
    5744           0 :         if (userpg->index)
    5745           0 :                 userpg->offset -= local64_read(&event->hw.prev_count);
    5746             : 
    5747           0 :         userpg->time_enabled = enabled +
    5748           0 :                         atomic64_read(&event->child_total_time_enabled);
    5749             : 
    5750           0 :         userpg->time_running = running +
    5751           0 :                         atomic64_read(&event->child_total_time_running);
    5752             : 
    5753           0 :         arch_perf_update_userpage(event, userpg, now);
    5754             : 
    5755           0 :         barrier();
    5756           0 :         ++userpg->lock;
    5757           0 :         preempt_enable();
    5758           0 : unlock:
    5759           0 :         rcu_read_unlock();
    5760           0 : }
    5761             : EXPORT_SYMBOL_GPL(perf_event_update_userpage);
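
The paired ++userpg->lock writes above implement the seqcount that self-monitoring userspace is expected to follow: sample lock, read the published fields, and retry if lock changed in between (the protocol is described in the perf_event_mmap_page comments in the uapi header). A hedged reader-side sketch, with the mapped page pointer as an assumption:

    #include <linux/perf_event.h>

    /* Snapshot the published counter offset, retrying across updates. */
    static __s64 read_userpage_offset(volatile struct perf_event_mmap_page *pc)
    {
            __u32 seq;
            __s64 offset;

            do {
                    seq = pc->lock;
                    __sync_synchronize();   /* pairs with the kernel's barrier() */
                    offset = pc->offset;
                    __sync_synchronize();
            } while (pc->lock != seq);

            return offset;
    }
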
    5762             : 
    5763           0 : static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
    5764             : {
    5765           0 :         struct perf_event *event = vmf->vma->vm_file->private_data;
    5766           0 :         struct perf_buffer *rb;
    5767           0 :         vm_fault_t ret = VM_FAULT_SIGBUS;
    5768             : 
    5769           0 :         if (vmf->flags & FAULT_FLAG_MKWRITE) {
    5770           0 :                 if (vmf->pgoff == 0)
    5771           0 :                         ret = 0;
    5772           0 :                 return ret;
    5773             :         }
    5774             : 
    5775           0 :         rcu_read_lock();
    5776           0 :         rb = rcu_dereference(event->rb);
    5777           0 :         if (!rb)
    5778           0 :                 goto unlock;
    5779             : 
    5780           0 :         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
    5781           0 :                 goto unlock;
    5782             : 
    5783           0 :         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
    5784           0 :         if (!vmf->page)
    5785           0 :                 goto unlock;
    5786             : 
    5787           0 :         get_page(vmf->page);
    5788           0 :         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
    5789           0 :         vmf->page->index   = vmf->pgoff;
    5790             : 
    5791           0 :         ret = 0;
    5792           0 : unlock:
    5793           0 :         rcu_read_unlock();
    5794             : 
    5795           0 :         return ret;
    5796             : }
    5797             : 
    5798           0 : static void ring_buffer_attach(struct perf_event *event,
    5799             :                                struct perf_buffer *rb)
    5800             : {
    5801           0 :         struct perf_buffer *old_rb = NULL;
    5802           0 :         unsigned long flags;
    5803             : 
    5804           0 :         if (event->rb) {
    5805             :                 /*
    5806             :                  * Should be impossible, we set this when removing
    5807             :                  * event->rb_entry and wait/clear when adding event->rb_entry.
    5808             :                  */
    5809           0 :                 WARN_ON_ONCE(event->rcu_pending);
    5810             : 
    5811           0 :                 old_rb = event->rb;
    5812           0 :                 spin_lock_irqsave(&old_rb->event_lock, flags);
    5813           0 :                 list_del_rcu(&event->rb_entry);
    5814           0 :                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
    5815             : 
    5816           0 :                 event->rcu_batches = get_state_synchronize_rcu();
    5817           0 :                 event->rcu_pending = 1;
    5818             :         }
    5819             : 
    5820           0 :         if (rb) {
    5821           0 :                 if (event->rcu_pending) {
    5822           0 :                         cond_synchronize_rcu(event->rcu_batches);
    5823           0 :                         event->rcu_pending = 0;
    5824             :                 }
    5825             : 
    5826           0 :                 spin_lock_irqsave(&rb->event_lock, flags);
    5827           0 :                 list_add_rcu(&event->rb_entry, &rb->event_list);
    5828           0 :                 spin_unlock_irqrestore(&rb->event_lock, flags);
    5829             :         }
    5830             : 
    5831             :         /*
    5832             :          * Avoid racing with perf_mmap_close(AUX): stop the event
    5833             :          * before swizzling the event::rb pointer; if it's getting
    5834             :          * unmapped, its aux_mmap_count will be 0 and it won't
    5835             :          * restart. See the comment in __perf_pmu_output_stop().
    5836             :          *
    5837             :          * Data will inevitably be lost when set_output is done in
    5838             :          * mid-air, but then again, whoever does it like this is
    5839             :          * not interested in the data anyway.
    5840             :          */
    5841           0 :         if (has_aux(event))
    5842           0 :                 perf_event_stop(event, 0);
    5843             : 
    5844           0 :         rcu_assign_pointer(event->rb, rb);
    5845             : 
    5846           0 :         if (old_rb) {
    5847           0 :                 ring_buffer_put(old_rb);
    5848             :                 /*
    5849             :                  * Since we detached from the old rb before attaching the
    5850             :                  * new one, we could have missed a wakeup.
    5851             :                  * Provide it now.
    5852             :                  */
    5853           0 :                 wake_up_all(&event->waitq);
    5854             :         }
    5855           0 : }
    5856             : 
    5857           0 : static void ring_buffer_wakeup(struct perf_event *event)
    5858             : {
    5859           0 :         struct perf_buffer *rb;
    5860             : 
    5861           0 :         rcu_read_lock();
    5862           0 :         rb = rcu_dereference(event->rb);
    5863           0 :         if (rb) {
    5864           0 :                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
    5865           0 :                         wake_up_all(&event->waitq);
    5866             :         }
    5867           0 :         rcu_read_unlock();
    5868           0 : }
    5869             : 
    5870           0 : struct perf_buffer *ring_buffer_get(struct perf_event *event)
    5871             : {
    5872           0 :         struct perf_buffer *rb;
    5873             : 
    5874           0 :         rcu_read_lock();
    5875           0 :         rb = rcu_dereference(event->rb);
    5876           0 :         if (rb) {
    5877           0 :                 if (!refcount_inc_not_zero(&rb->refcount))
    5878           0 :                         rb = NULL;
    5879             :         }
    5880           0 :         rcu_read_unlock();
    5881             : 
    5882           0 :         return rb;
    5883             : }
    5884             : 
    5885           0 : void ring_buffer_put(struct perf_buffer *rb)
    5886             : {
    5887           0 :         if (!refcount_dec_and_test(&rb->refcount))
    5888             :                 return;
    5889             : 
    5890           0 :         WARN_ON_ONCE(!list_empty(&rb->event_list));
    5891             : 
    5892           0 :         call_rcu(&rb->rcu_head, rb_free_rcu);
    5893             : }
    5894             : 
    5895           0 : static void perf_mmap_open(struct vm_area_struct *vma)
    5896             : {
    5897           0 :         struct perf_event *event = vma->vm_file->private_data;
    5898             : 
    5899           0 :         atomic_inc(&event->mmap_count);
    5900           0 :         atomic_inc(&event->rb->mmap_count);
    5901             : 
    5902           0 :         if (vma->vm_pgoff)
    5903           0 :                 atomic_inc(&event->rb->aux_mmap_count);
    5904             : 
    5905           0 :         if (event->pmu->event_mapped)
    5906           0 :                 event->pmu->event_mapped(event, vma->vm_mm);
    5907           0 : }
    5908             : 
    5909             : static void perf_pmu_output_stop(struct perf_event *event);
    5910             : 
    5911             : /*
    5912             :  * A buffer can be mmap()ed multiple times; either directly through the same
    5913             :  * event, or through other events by use of perf_event_set_output().
    5914             :  *
    5915             :  * In order to undo the VM accounting done by perf_mmap() we need to destroy
    5916             :  * the buffer here, where we still have a VM context. This means we need
    5917             :  * to detach all events redirecting to us.
    5918             :  */
    5919           0 : static void perf_mmap_close(struct vm_area_struct *vma)
    5920             : {
    5921           0 :         struct perf_event *event = vma->vm_file->private_data;
    5922           0 :         struct perf_buffer *rb = ring_buffer_get(event);
    5923           0 :         struct user_struct *mmap_user = rb->mmap_user;
    5924           0 :         int mmap_locked = rb->mmap_locked;
    5925           0 :         unsigned long size = perf_data_size(rb);
    5926           0 :         bool detach_rest = false;
    5927             : 
    5928           0 :         if (event->pmu->event_unmapped)
    5929           0 :                 event->pmu->event_unmapped(event, vma->vm_mm);
    5930             : 
    5931             :         /*
    5932             :          * rb->aux_mmap_count will always drop before rb->mmap_count and
    5933             :          * event->mmap_count, so it is ok to use event->mmap_mutex to
    5934             :          * serialize with perf_mmap here.
    5935             :          */
    5936           0 :         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
    5937           0 :             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
    5938             :                 /*
    5939             :                  * Stop all AUX events that are writing to this buffer,
    5940             :                  * so that we can free its AUX pages and corresponding PMU
    5941             :                  * data. Note that after rb::aux_mmap_count dropped to zero,
    5942             :                  * they won't start any more (see perf_aux_output_begin()).
    5943             :                  */
    5944           0 :                 perf_pmu_output_stop(event);
    5945             : 
    5946             :                 /* now it's safe to free the pages */
    5947           0 :                 atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
    5948           0 :                 atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);
    5949             : 
    5950             :                 /* this has to be the last one */
    5951           0 :                 rb_free_aux(rb);
    5952           0 :                 WARN_ON_ONCE(refcount_read(&rb->aux_refcount));
    5953             : 
    5954           0 :                 mutex_unlock(&event->mmap_mutex);
    5955             :         }
    5956             : 
    5957           0 :         if (atomic_dec_and_test(&rb->mmap_count))
    5958           0 :                 detach_rest = true;
    5959             : 
    5960           0 :         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
    5961           0 :                 goto out_put;
    5962             : 
    5963           0 :         ring_buffer_attach(event, NULL);
    5964           0 :         mutex_unlock(&event->mmap_mutex);
    5965             : 
    5966             :         /* If there's still other mmap()s of this buffer, we're done. */
    5967           0 :         if (!detach_rest)
    5968           0 :                 goto out_put;
    5969             : 
    5970             :         /*
    5971             :          * No other mmap()s, detach from all other events that might redirect
    5972             :          * into the now unreachable buffer. Somewhat complicated by the
    5973             :          * fact that rb::event_lock otherwise nests inside mmap_mutex.
    5974             :          */
    5975           0 : again:
    5976           0 :         rcu_read_lock();
    5977           0 :         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
    5978           0 :                 if (!atomic_long_inc_not_zero(&event->refcount)) {
    5979             :                         /*
    5980             :                          * This event is en-route to free_event() which will
    5981             :                          * detach it and remove it from the list.
    5982             :                          */
    5983           0 :                         continue;
    5984             :                 }
    5985           0 :                 rcu_read_unlock();
    5986             : 
    5987           0 :                 mutex_lock(&event->mmap_mutex);
    5988             :                 /*
    5989             :                  * Check we didn't race with perf_event_set_output() which can
    5990             :                  * swizzle the rb from under us while we were waiting to
    5991             :                  * acquire mmap_mutex.
    5992             :                  *
    5993             :                  * If we find a different rb, ignore this event; the next
    5994             :                  * iteration will no longer find it on the list. We have to
    5995             :                  * still restart the iteration to make sure we're not now
    5996             :                  * iterating the wrong list.
    5997             :                  */
    5998           0 :                 if (event->rb == rb)
    5999           0 :                         ring_buffer_attach(event, NULL);
    6000             : 
    6001           0 :                 mutex_unlock(&event->mmap_mutex);
    6002           0 :                 put_event(event);
    6003             : 
    6004             :                 /*
    6005             :                  * Restart the iteration; either we're on the wrong list or
    6006             :                  * we destroyed its integrity by deleting an entry.
    6007             :                  */
    6008           0 :                 goto again;
    6009             :         }
    6010           0 :         rcu_read_unlock();
    6011             : 
    6012             :         /*
    6013             :          * There could still be a few 0-ref events on the list; they'll
    6014             :          * get cleaned up by free_event() -- they'll also still have their
    6015             :          * ref on the rb and will free it whenever they are done with it.
    6016             :          *
    6017             :          * Aside from that, this buffer is 'fully' detached and unmapped,
    6018             :          * undo the VM accounting.
    6019             :          */
    6020             : 
    6021           0 :         atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
    6022             :                         &mmap_user->locked_vm);
    6023           0 :         atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
    6024           0 :         free_uid(mmap_user);
    6025             : 
    6026           0 : out_put:
    6027           0 :         ring_buffer_put(rb); /* could be last */
    6028           0 : }
    6029             : 
    6030             : static const struct vm_operations_struct perf_mmap_vmops = {
    6031             :         .open           = perf_mmap_open,
    6032             :         .close          = perf_mmap_close, /* non mergeable */
    6033             :         .fault          = perf_mmap_fault,
    6034             :         .page_mkwrite   = perf_mmap_fault,
    6035             : };
    6036             : 
    6037           0 : static int perf_mmap(struct file *file, struct vm_area_struct *vma)
    6038             : {
    6039           0 :         struct perf_event *event = file->private_data;
    6040           0 :         unsigned long user_locked, user_lock_limit;
    6041           0 :         struct user_struct *user = current_user();
    6042           0 :         struct perf_buffer *rb = NULL;
    6043           0 :         unsigned long locked, lock_limit;
    6044           0 :         unsigned long vma_size;
    6045           0 :         unsigned long nr_pages;
    6046           0 :         long user_extra = 0, extra = 0;
    6047           0 :         int ret = 0, flags = 0;
    6048             : 
    6049             :         /*
    6050             :          * Don't allow mmap() of inherited per-task counters. This would
    6051             :          * create a performance issue due to all children writing to the
    6052             :          * same rb.
    6053             :          */
    6054           0 :         if (event->cpu == -1 && event->attr.inherit)
    6055             :                 return -EINVAL;
    6056             : 
    6057           0 :         if (!(vma->vm_flags & VM_SHARED))
    6058             :                 return -EINVAL;
    6059             : 
    6060           0 :         ret = security_perf_event_read(event);
    6061           0 :         if (ret)
    6062             :                 return ret;
    6063             : 
    6064           0 :         vma_size = vma->vm_end - vma->vm_start;
    6065             : 
    6066           0 :         if (vma->vm_pgoff == 0) {
    6067           0 :                 nr_pages = (vma_size / PAGE_SIZE) - 1;
    6068             :         } else {
    6069             :                 /*
    6070             :                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
    6071             :                  * mapped, all subsequent mappings should have the same size
    6072             :                  * and offset. Must be above the normal perf buffer.
    6073             :                  */
    6074           0 :                 u64 aux_offset, aux_size;
    6075             : 
    6076           0 :                 if (!event->rb)
    6077             :                         return -EINVAL;
    6078             : 
    6079           0 :                 nr_pages = vma_size / PAGE_SIZE;
    6080             : 
    6081           0 :                 mutex_lock(&event->mmap_mutex);
    6082           0 :                 ret = -EINVAL;
    6083             : 
    6084           0 :                 rb = event->rb;
    6085           0 :                 if (!rb)
    6086           0 :                         goto aux_unlock;
    6087             : 
    6088           0 :                 aux_offset = READ_ONCE(rb->user_page->aux_offset);
    6089           0 :                 aux_size = READ_ONCE(rb->user_page->aux_size);
    6090             : 
    6091           0 :                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
    6092           0 :                         goto aux_unlock;
    6093             : 
    6094           0 :                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
    6095           0 :                         goto aux_unlock;
    6096             : 
    6097             :                 /* already mapped with a different offset */
    6098           0 :                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
    6099           0 :                         goto aux_unlock;
    6100             : 
    6101           0 :                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
    6102           0 :                         goto aux_unlock;
    6103             : 
    6104             :                 /* already mapped with a different size */
    6105           0 :                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
    6106           0 :                         goto aux_unlock;
    6107             : 
    6108           0 :                 if (!is_power_of_2(nr_pages))
    6109           0 :                         goto aux_unlock;
    6110             : 
    6111           0 :                 if (!atomic_inc_not_zero(&rb->mmap_count))
    6112           0 :                         goto aux_unlock;
    6113             : 
    6114           0 :                 if (rb_has_aux(rb)) {
    6115           0 :                         atomic_inc(&rb->aux_mmap_count);
    6116           0 :                         ret = 0;
    6117           0 :                         goto unlock;
    6118             :                 }
    6119             : 
    6120           0 :                 atomic_set(&rb->aux_mmap_count, 1);
    6121           0 :                 user_extra = nr_pages;
    6122             : 
    6123           0 :                 goto accounting;
    6124             :         }
    6125             : 
    6126             :         /*
    6127             :          * If we have rb pages ensure they're a power-of-two number, so we
    6128             :          * can do bitmasks instead of modulo.
    6129             :          */
    6130           0 :         if (nr_pages != 0 && !is_power_of_2(nr_pages))
    6131             :                 return -EINVAL;
    6132             : 
    6133           0 :         if (vma_size != PAGE_SIZE * (1 + nr_pages))
    6134             :                 return -EINVAL;
    6135             : 
    6136           0 :         WARN_ON_ONCE(event->ctx->parent_ctx);
    6137             : again:
    6138           0 :         mutex_lock(&event->mmap_mutex);
    6139           0 :         if (event->rb) {
    6140           0 :                 if (event->rb->nr_pages != nr_pages) {
    6141           0 :                         ret = -EINVAL;
    6142           0 :                         goto unlock;
    6143             :                 }
    6144             : 
    6145           0 :                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
    6146             :                         /*
    6147             :                          * Raced against perf_mmap_close() through
    6148             :                          * perf_event_set_output(). Try again, hope for better
    6149             :                          * luck.
    6150             :                          */
    6151           0 :                         mutex_unlock(&event->mmap_mutex);
    6152           0 :                         goto again;
    6153             :                 }
    6154             : 
    6155           0 :                 goto unlock;
    6156             :         }
    6157             : 
    6158           0 :         user_extra = nr_pages + 1;
    6159             : 
    6160           0 : accounting:
    6161           0 :         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
    6162             : 
    6163             :         /*
    6164             :          * Increase the limit linearly with more CPUs:
    6165             :          */
    6166           0 :         user_lock_limit *= num_online_cpus();
    6167             : 
    6168           0 :         user_locked = atomic_long_read(&user->locked_vm);
    6169             : 
    6170             :         /*
    6171             :          * sysctl_perf_event_mlock may have changed, so that
    6172             :          *     user->locked_vm > user_lock_limit
    6173             :          */
    6174           0 :         if (user_locked > user_lock_limit)
    6175             :                 user_locked = user_lock_limit;
    6176           0 :         user_locked += user_extra;
    6177             : 
    6178           0 :         if (user_locked > user_lock_limit) {
    6179             :                 /*
    6180             :                  * charge locked_vm until it hits user_lock_limit;
    6181             :                  * charge the rest from pinned_vm
    6182             :                  */
    6183           0 :                 extra = user_locked - user_lock_limit;
    6184           0 :                 user_extra -= extra;
    6185             :         }
    6186             : 
    6187           0 :         lock_limit = rlimit(RLIMIT_MEMLOCK);
    6188           0 :         lock_limit >>= PAGE_SHIFT;
    6189           0 :         locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
    6190             : 
    6191           0 :         if ((locked > lock_limit) && perf_is_paranoid() &&
    6192           0 :                 !capable(CAP_IPC_LOCK)) {
    6193           0 :                 ret = -EPERM;
    6194           0 :                 goto unlock;
    6195             :         }
    6196             : 
    6197           0 :         WARN_ON(!rb && event->rb);
    6198             : 
    6199           0 :         if (vma->vm_flags & VM_WRITE)
    6200           0 :                 flags |= RING_BUFFER_WRITABLE;
    6201             : 
    6202           0 :         if (!rb) {
    6203           0 :                 rb = rb_alloc(nr_pages,
    6204           0 :                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
    6205             :                               event->cpu, flags);
    6206             : 
    6207           0 :                 if (!rb) {
    6208           0 :                         ret = -ENOMEM;
    6209           0 :                         goto unlock;
    6210             :                 }
    6211             : 
    6212           0 :                 atomic_set(&rb->mmap_count, 1);
    6213           0 :                 rb->mmap_user = get_current_user();
    6214           0 :                 rb->mmap_locked = extra;
    6215             : 
    6216           0 :                 ring_buffer_attach(event, rb);
    6217             : 
    6218           0 :                 perf_event_init_userpage(event);
    6219           0 :                 perf_event_update_userpage(event);
    6220             :         } else {
    6221           0 :                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
    6222           0 :                                    event->attr.aux_watermark, flags);
    6223           0 :                 if (!ret)
    6224           0 :                         rb->aux_mmap_locked = extra;
    6225             :         }
    6226             : 
    6227           0 : unlock:
    6228           0 :         if (!ret) {
    6229           0 :                 atomic_long_add(user_extra, &user->locked_vm);
    6230           0 :                 atomic64_add(extra, &vma->vm_mm->pinned_vm);
    6231             : 
    6232           0 :                 atomic_inc(&event->mmap_count);
    6233           0 :         } else if (rb) {
    6234           0 :                 atomic_dec(&rb->mmap_count);
    6235             :         }
    6236           0 : aux_unlock:
    6237           0 :         mutex_unlock(&event->mmap_mutex);
    6238             : 
    6239             :         /*
    6240             :          * Since pinned accounting is per vm we cannot allow fork() to copy our
    6241             :          * vma.
    6242             :          */
    6243           0 :         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
    6244           0 :         vma->vm_ops = &perf_mmap_vmops;
    6245             : 
    6246           0 :         if (event->pmu->event_mapped)
    6247           0 :                 event->pmu->event_mapped(event, vma->vm_mm);
    6248             : 
    6249             :         return ret;
    6250             : }
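
The size checks above dictate the userspace side of the mapping: offset 0, MAP_SHARED, and a length of one metadata page plus a power-of-two number of data pages (the AUX area, if used, is a second mmap at the offset and size advertised in the user page). A minimal sketch; perf_fd and the page count are assumptions:

    #include <sys/mman.h>
    #include <unistd.h>
    #include <linux/perf_event.h>

    /* Map the ring buffer: 1 control page + data_pages (power of two). */
    static struct perf_event_mmap_page *map_ring_buffer(int perf_fd,
                                                        unsigned int data_pages)
    {
            size_t len = (1 + data_pages) * sysconf(_SC_PAGESIZE);
            void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
                              perf_fd, 0);

            return base == MAP_FAILED ? NULL : base;
    }
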
    6251             : 
    6252           0 : static int perf_fasync(int fd, struct file *filp, int on)
    6253             : {
    6254           0 :         struct inode *inode = file_inode(filp);
    6255           0 :         struct perf_event *event = filp->private_data;
    6256           0 :         int retval;
    6257             : 
    6258           0 :         inode_lock(inode);
    6259           0 :         retval = fasync_helper(fd, filp, on, &event->fasync);
    6260           0 :         inode_unlock(inode);
    6261             : 
    6262           0 :         if (retval < 0)
    6263             :                 return retval;
    6264             : 
    6265             :         return 0;
    6266             : }
    6267             : 
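perf_fasync() simply plugs the fd into the generic fasync machinery, so a reader can ask for SIGIO instead of blocking in poll(). A user-space sketch of that setup, assuming perf_fd came from perf_event_open(); the fcntl() calls are the ordinary O_ASYNC dance, nothing perf-specific.

    #include <fcntl.h>
    #include <signal.h>
    #include <unistd.h>

    static void on_sigio(int sig)
    {
            /* data (or an overflow notification) is available on the perf fd */
    }

    static int request_sigio(int perf_fd)
    {
            int flags;

            signal(SIGIO, on_sigio);
            if (fcntl(perf_fd, F_SETOWN, getpid()) < 0)      /* who gets the signal */
                    return -1;
            flags = fcntl(perf_fd, F_GETFL);
            return fcntl(perf_fd, F_SETFL, flags | O_ASYNC); /* lands in perf_fasync() */
    }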
    6268             : static const struct file_operations perf_fops = {
    6269             :         .llseek                 = no_llseek,
    6270             :         .release                = perf_release,
    6271             :         .read                   = perf_read,
    6272             :         .poll                   = perf_poll,
    6273             :         .unlocked_ioctl         = perf_ioctl,
    6274             :         .compat_ioctl           = perf_compat_ioctl,
    6275             :         .mmap                   = perf_mmap,
    6276             :         .fasync                 = perf_fasync,
    6277             : };
    6278             : 
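Everything in perf_fops is reached through the fd returned by the perf_event_open() syscall, for which glibc provides no wrapper. A minimal sketch that opens a software task-clock counter on the calling thread and exercises the .read handler; the attr values here are only an illustration.

    #include <linux/perf_event.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>
    #include <string.h>
    #include <stdint.h>
    #include <stdio.h>

    static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                int cpu, int group_fd, unsigned long flags)
    {
            return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
    }

    int main(void)
    {
            struct perf_event_attr attr;
            uint64_t count;
            int fd;

            memset(&attr, 0, sizeof(attr));
            attr.size   = sizeof(attr);
            attr.type   = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_TASK_CLOCK;

            fd = perf_event_open(&attr, 0, -1, -1, 0);   /* this task, any CPU */
            if (fd < 0)
                    return 1;

            if (read(fd, &count, sizeof(count)) == sizeof(count)) /* -> perf_read() */
                    printf("task clock: %llu ns\n", (unsigned long long)count);
            close(fd);                                   /* -> perf_release() */
            return 0;
    }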
    6279             : /*
    6280             :  * Perf event wakeup
    6281             :  *
    6282             :  * If there's data, ensure we set the poll() state and publish everything
    6283             :  * to user-space before waking everybody up.
    6284             :  */
    6285             : 
    6286           0 : static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
    6287             : {
    6288             :         /* only the parent has fasync state */
    6289           0 :         if (event->parent)
    6290           0 :                 event = event->parent;
    6291           0 :         return &event->fasync;
    6292             : }
    6293             : 
    6294           0 : void perf_event_wakeup(struct perf_event *event)
    6295             : {
    6296           0 :         ring_buffer_wakeup(event);
    6297             : 
    6298           0 :         if (event->pending_kill) {
    6299           0 :                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
    6300           0 :                 event->pending_kill = 0;
    6301             :         }
    6302           0 : }
    6303             : 
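perf_event_wakeup() is what eventually satisfies a poll() sleeper on the fd (and fires fasync when pending_kill is set). A sketch of the consumer side, assuming the event was configured with wakeup_events or a watermark so wakeups actually occur.

    #include <poll.h>

    /* Block until the ring buffer has data (POLLIN) or the timeout expires. */
    static int wait_for_samples(int perf_fd, int timeout_ms)
    {
            struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };
            int n = poll(&pfd, 1, timeout_ms);

            if (n > 0 && (pfd.revents & POLLIN))
                    return 1;       /* perf_poll() reported readable data */
            return 0;
    }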
    6304           0 : static void perf_pending_event_disable(struct perf_event *event)
    6305             : {
    6306           0 :         int cpu = READ_ONCE(event->pending_disable);
    6307             : 
    6308           0 :         if (cpu < 0)
    6309             :                 return;
    6310             : 
    6311           0 :         if (cpu == smp_processor_id()) {
    6312           0 :                 WRITE_ONCE(event->pending_disable, -1);
    6313           0 :                 perf_event_disable_local(event);
    6314           0 :                 return;
    6315             :         }
    6316             : 
    6317             :         /*
    6318             :          *  CPU-A                       CPU-B
    6319             :          *
    6320             :          *  perf_event_disable_inatomic()
    6321             :          *    @pending_disable = CPU-A;
    6322             :          *    irq_work_queue();
    6323             :          *
    6324             :          *  sched-out
    6325             :          *    @pending_disable = -1;
    6326             :          *
    6327             :          *                              sched-in
    6328             :          *                              perf_event_disable_inatomic()
    6329             :          *                                @pending_disable = CPU-B;
    6330             :          *                                irq_work_queue(); // FAILS
    6331             :          *
    6332             :          *  irq_work_run()
    6333             :          *    perf_pending_event()
    6334             :          *
    6335             :          * But the event runs on CPU-B and wants disabling there.
    6336             :          */
    6337           0 :         irq_work_queue_on(&event->pending, cpu);
    6338             : }
    6339             : 
    6340           0 : static void perf_pending_event(struct irq_work *entry)
    6341             : {
    6342           0 :         struct perf_event *event = container_of(entry, struct perf_event, pending);
    6343           0 :         int rctx;
    6344             : 
    6345           0 :         rctx = perf_swevent_get_recursion_context();
    6346             :         /*
    6347             :          * If we 'fail' here, that's OK, it means recursion is already disabled
    6348             :          * and we won't recurse 'further'.
    6349             :          */
    6350             : 
    6351           0 :         perf_pending_event_disable(event);
    6352             : 
    6353           0 :         if (event->pending_wakeup) {
    6354           0 :                 event->pending_wakeup = 0;
    6355           0 :                 perf_event_wakeup(event);
    6356             :         }
    6357             : 
    6358           0 :         if (rctx >= 0)
    6359           0 :                 perf_swevent_put_recursion_context(rctx);
    6360           0 : }
    6361             : 
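perf_pending_event_disable() and perf_pending_event() lean on the generic irq_work API to push work out of NMI context, and onto the right CPU when the race described in the comment above is lost. A hedged kernel-side sketch of the same pattern; the my_* names are hypothetical, and only init_irq_work()/irq_work_queue_on() are the real API.

    #include <linux/irq_work.h>
    #include <linux/kernel.h>
    #include <linux/printk.h>
    #include <linux/smp.h>

    struct my_deferred {
            struct irq_work work;
            int             target_cpu;
    };

    /* Runs later in hard-IRQ context on the CPU the work was queued on. */
    static void my_deferred_fn(struct irq_work *entry)
    {
            struct my_deferred *d = container_of(entry, struct my_deferred, work);

            pr_info("deferred work on CPU %d (wanted %d)\n",
                    smp_processor_id(), d->target_cpu);
    }

    static void my_deferred_init(struct my_deferred *d)
    {
            init_irq_work(&d->work, my_deferred_fn);
    }

    /* NMI-safe: queue the callback on a specific CPU, as done above. */
    static void my_defer_to(struct my_deferred *d, int cpu)
    {
            d->target_cpu = cpu;
            irq_work_queue_on(&d->work, cpu);
    }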
    6362             : /*
    6363             :  * We assume there is only KVM supporting the callbacks.
    6364             :  * Later on, we might change it to a list if there is
    6365             :  * another virtualization implementation supporting the callbacks.
    6366             :  */
    6367             : struct perf_guest_info_callbacks *perf_guest_cbs;
    6368             : 
    6369           0 : int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
    6370             : {
    6371           0 :         perf_guest_cbs = cbs;
    6372           0 :         return 0;
    6373             : }
    6374             : EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
    6375             : 
    6376           0 : int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
    6377             : {
    6378           0 :         perf_guest_cbs = NULL;
    6379           0 :         return 0;
    6380             : }
    6381             : EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
    6382             : 
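perf_register_guest_info_callbacks() is how a hypervisor (KVM in practice) lets perf attribute PMIs to guest context. A rough module-side sketch; the callback names (is_in_guest, is_user_mode, get_guest_ip) are assumed from the struct perf_guest_info_callbacks of this kernel era and should be checked against include/linux/perf_event.h.

    #include <linux/perf_event.h>
    #include <linux/module.h>

    /* Hypothetical callbacks; field names assumed, verify against the struct. */
    static int my_is_in_guest(void)             { return 0; }
    static int my_is_user_mode(void)            { return 0; }
    static unsigned long my_get_guest_ip(void)  { return 0; }

    static struct perf_guest_info_callbacks my_guest_cbs = {
            .is_in_guest  = my_is_in_guest,
            .is_user_mode = my_is_user_mode,
            .get_guest_ip = my_get_guest_ip,
    };

    static int __init my_init(void)
    {
            return perf_register_guest_info_callbacks(&my_guest_cbs);
    }

    static void __exit my_exit(void)
    {
            perf_unregister_guest_info_callbacks(&my_guest_cbs);
    }

    module_init(my_init);
    module_exit(my_exit);
    MODULE_LICENSE("GPL");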
    6383             : static void
    6384           0 : perf_output_sample_regs(struct perf_output_handle *handle,
    6385             :                         struct pt_regs *regs, u64 mask)
    6386             : {
    6387           0 :         int bit;
    6388           0 :         DECLARE_BITMAP(_mask, 64);
    6389             : 
    6390           0 :         bitmap_from_u64(_mask, mask);
    6391           0 :         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
    6392           0 :                 u64 val;
    6393             : 
    6394           0 :                 val = perf_reg_value(regs, bit);
    6395           0 :                 perf_output_put(handle, val);
    6396             :         }
    6397           0 : }
    6398             : 
    6399           0 : static void perf_sample_regs_user(struct perf_regs *regs_user,
    6400             :                                   struct pt_regs *regs)
    6401             : {
    6402           0 :         if (user_mode(regs)) {
    6403           0 :                 regs_user->abi = perf_reg_abi(current);
    6404           0 :                 regs_user->regs = regs;
    6405           0 :         } else if (!(current->flags & PF_KTHREAD)) {
    6406           0 :                 perf_get_regs_user(regs_user, regs);
    6407             :         } else {
    6408           0 :                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
    6409           0 :                 regs_user->regs = NULL;
    6410             :         }
    6411           0 : }
    6412             : 
    6413           0 : static void perf_sample_regs_intr(struct perf_regs *regs_intr,
    6414             :                                   struct pt_regs *regs)
    6415             : {
    6416           0 :         regs_intr->regs = regs;
    6417           0 :         regs_intr->abi  = perf_reg_abi(current);
    6418             : }
    6419             : 
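perf_output_sample_regs() emits one u64 per bit set in the register mask, preceded by a u64 ABI word (zero when nothing was captured). A user-space sketch of requesting and decoding that layout; which PERF_REG_* bits are meaningful is architecture-specific and left to the caller here.

    #include <linux/perf_event.h>
    #include <stdint.h>

    /* Request user registers with each sample: one bit per register. */
    static void want_user_regs(struct perf_event_attr *attr, uint64_t reg_mask)
    {
            attr->sample_type     |= PERF_SAMPLE_REGS_USER;
            attr->sample_regs_user = reg_mask;  /* arch PERF_REG_* bits */
    }

    /* In the record: u64 abi, then one u64 per set bit, lowest bit first. */
    static const uint64_t *decode_user_regs(const uint64_t *p, uint64_t reg_mask,
                                            uint64_t *regs_out /* indexed by bit */)
    {
            uint64_t abi = *p++;

            if (abi == PERF_SAMPLE_REGS_ABI_NONE)
                    return p;                   /* nothing was dumped */
            for (int bit = 0; bit < 64; bit++)
                    if (reg_mask & (1ULL << bit))
                            regs_out[bit] = *p++;
            return p;
    }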
    6420             : 
    6421             : /*
    6422             :  * Get remaining task size from user stack pointer.
    6423             :  *
    6424             :  * It'd be better to look up the stack VMA and limit this more
    6425             :  * precisely, but there's no way to do that safely under interrupt,
    6426             :  * so TASK_SIZE is used as the limit.
    6427             :  */
    6428           0 : static u64 perf_ustack_task_size(struct pt_regs *regs)
    6429             : {
    6430           0 :         unsigned long addr = perf_user_stack_pointer(regs);
    6431             : 
    6432           0 :         if (!addr || addr >= TASK_SIZE)
    6433           0 :                 return 0;
    6434             : 
    6435           0 :         return TASK_SIZE - addr;
    6436             : }
    6437             : 
    6438             : static u16
    6439           0 : perf_sample_ustack_size(u16 stack_size, u16 header_size,
    6440             :                         struct pt_regs *regs)
    6441             : {
    6442           0 :         u64 task_size;
    6443             : 
    6444             :         /* No regs, no stack pointer, no dump. */
    6445           0 :         if (!regs)
    6446             :                 return 0;
    6447             : 
    6448             :         /*
    6449             :          * Check whether the requested stack size fits within:
    6450             :          * - TASK_SIZE
    6451             :          *   If it doesn't, we limit the size to what remains of TASK_SIZE.
    6452             :          *
    6453             :          * - the remaining sample size
    6454             :          *   If it doesn't, we shrink the stack size to
    6455             :          *   fit into the remaining sample size.
    6456             :          */
    6457             : 
    6458           0 :         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
    6459           0 :         stack_size = min(stack_size, (u16) task_size);
    6460             : 
    6461             :         /* Current header size plus static size and dynamic size. */
    6462           0 :         header_size += 2 * sizeof(u64);
    6463             : 
    6464             :         /* Do we fit in with the current stack dump size? */
    6465           0 :         if ((u16) (header_size + stack_size) < header_size) {
    6466             :                 /*
    6467             :                  * If we overflow the maximum size for the sample,
    6468             :                  * we customize the stack dump size to fit in.
    6469             :                  */
    6470           0 :                 stack_size = USHRT_MAX - header_size - sizeof(u64);
    6471           0 :                 stack_size = round_up(stack_size, sizeof(u64));
    6472             :         }
    6473             : 
    6474             :         return stack_size;
    6475             : }
    6476             : 
    6477             : static void
    6478           0 : perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
    6479             :                           struct pt_regs *regs)
    6480             : {
    6481             :         /* Case of a kernel thread, nothing to dump */
    6482           0 :         if (!regs) {
    6483           0 :                 u64 size = 0;
    6484           0 :                 perf_output_put(handle, size);
    6485             :         } else {
    6486           0 :                 unsigned long sp;
    6487           0 :                 unsigned int rem;
    6488           0 :                 u64 dyn_size;
    6489           0 :                 mm_segment_t fs;
    6490             : 
    6491             :                 /*
    6492             :                  * We dump:
    6493             :                  * static size
    6494             :                  *   - the size requested by the user, or the best one we can
    6495             :                  *     fit into the sample max size
    6496             :                  * data
    6497             :                  *   - user stack dump data
    6498             :                  * dynamic size
    6499             :                  *   - the actual dumped size
    6500             :                  */
    6501             : 
    6502             :                 /* Static size. */
    6503           0 :                 perf_output_put(handle, dump_size);
    6504             : 
    6505             :                 /* Data. */
    6506           0 :                 sp = perf_user_stack_pointer(regs);
    6507           0 :                 fs = force_uaccess_begin();
    6508           0 :                 rem = __output_copy_user(handle, (void *) sp, dump_size);
    6509           0 :                 force_uaccess_end(fs);
    6510           0 :                 dyn_size = dump_size - rem;
    6511             : 
    6512           0 :                 perf_output_skip(handle, rem);
    6513             : 
    6514             :                 /* Dynamic size. */
    6515           0 :                 perf_output_put(handle, dyn_size);
    6516             :         }
    6517           0 : }
    6518             : 
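As the "We dump:" comment above says, a user-stack dump is laid out as a static size, the raw stack bytes, and then the dynamically copied size. A user-space sketch of requesting the dump and stepping over those three fields; the 8 KiB request is an arbitrary choice and may be clamped by perf_sample_ustack_size().

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <string.h>

    static void want_user_stack(struct perf_event_attr *attr)
    {
            attr->sample_type      |= PERF_SAMPLE_STACK_USER;
            attr->sample_stack_user = 8192;     /* bytes requested; may be clamped */
    }

    /* Layout in the sample: u64 size, size bytes of stack, u64 dyn_size.
     * When size is 0 (kernel thread), only the size field is present. */
    static const void *decode_user_stack(const void *p, void *copy, uint64_t copy_len)
    {
            uint64_t size = *(const uint64_t *)p;

            p = (const uint64_t *)p + 1;
            if (size) {
                    uint64_t dyn_size = *(const uint64_t *)((const char *)p + size);

                    memcpy(copy, p, dyn_size < copy_len ? dyn_size : copy_len);
                    p = (const char *)p + size + sizeof(uint64_t);
            }
            return p;
    }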
    6519           0 : static unsigned long perf_prepare_sample_aux(struct perf_event *event,
    6520             :                                           struct perf_sample_data *data,
    6521             :                                           size_t size)
    6522             : {
    6523           0 :         struct perf_event *sampler = event->aux_event;
    6524           0 :         struct perf_buffer *rb;
    6525             : 
    6526           0 :         data->aux_size = 0;
    6527             : 
    6528           0 :         if (!sampler)
    6529           0 :                 goto out;
    6530             : 
    6531           0 :         if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
    6532           0 :                 goto out;
    6533             : 
    6534           0 :         if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
    6535           0 :                 goto out;
    6536             : 
    6537           0 :         rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
    6538           0 :         if (!rb)
    6539           0 :                 goto out;
    6540             : 
    6541             :         /*
    6542             :          * If this is an NMI hit inside sampling code, don't take
    6543             :          * the sample. See also perf_aux_sample_output().
    6544             :          */
    6545           0 :         if (READ_ONCE(rb->aux_in_sampling)) {
    6546           0 :                 data->aux_size = 0;
    6547             :         } else {
    6548           0 :                 size = min_t(size_t, size, perf_aux_size(rb));
    6549           0 :                 data->aux_size = ALIGN(size, sizeof(u64));
    6550             :         }
    6551           0 :         ring_buffer_put(rb);
    6552             : 
    6553           0 : out:
    6554           0 :         return data->aux_size;
    6555             : }
    6556             : 
    6557           0 : long perf_pmu_snapshot_aux(struct perf_buffer *rb,
    6558             :                            struct perf_event *event,
    6559             :                            struct perf_output_handle *handle,
    6560             :                            unsigned long size)
    6561             : {
    6562           0 :         unsigned long flags;
    6563           0 :         long ret;
    6564             : 
    6565             :         /*
    6566             :          * Normal ->start()/->stop() callbacks run in IRQ mode in scheduler
    6567             :          * paths. If we start calling them in NMI context, they may race with
    6568             :          * the IRQ ones, that is, for example, re-starting an event that's just
    6569             :          * been stopped, which is why we're using a separate callback that
    6570             :          * doesn't change the event state.
    6571             :          *
    6572             :          * IRQs need to be disabled to prevent IPIs from racing with us.
    6573             :          */
    6574           0 :         local_irq_save(flags);
    6575             :         /*
    6576             :          * Guard against NMI hits inside the critical section;
    6577             :          * see also perf_prepare_sample_aux().
    6578             :          */
    6579           0 :         WRITE_ONCE(rb->aux_in_sampling, 1);
    6580           0 :         barrier();
    6581             : 
    6582           0 :         ret = event->pmu->snapshot_aux(event, handle, size);
    6583             : 
    6584           0 :         barrier();
    6585           0 :         WRITE_ONCE(rb->aux_in_sampling, 0);
    6586           0 :         local_irq_restore(flags);
    6587             : 
    6588           0 :         return ret;
    6589             : }
    6590             : 
    6591           0 : static void perf_aux_sample_output(struct perf_event *event,
    6592             :                                    struct perf_output_handle *handle,
    6593             :                                    struct perf_sample_data *data)
    6594             : {
    6595           0 :         struct perf_event *sampler = event->aux_event;
    6596           0 :         struct perf_buffer *rb;
    6597           0 :         unsigned long pad;
    6598           0 :         long size;
    6599             : 
    6600           0 :         if (WARN_ON_ONCE(!sampler || !data->aux_size))
    6601             :                 return;
    6602             : 
    6603           0 :         rb = ring_buffer_get(sampler->parent ? sampler->parent : sampler);
    6604           0 :         if (!rb)
    6605             :                 return;
    6606             : 
    6607           0 :         size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);
    6608             : 
    6609             :         /*
    6610             :          * An error here means that perf_output_copy() failed (returned a
    6611             :          * non-zero surplus that it didn't copy), which in its current
    6612             :          * enlightened implementation is not possible. If that changes, we'd
    6613             :          * like to know.
    6614             :          */
    6615           0 :         if (WARN_ON_ONCE(size < 0))
    6616           0 :                 goto out_put;
    6617             : 
    6618             :         /*
    6619             :          * The pad comes from ALIGN()ing data->aux_size up to u64 in
    6620             :          * perf_prepare_sample_aux(), so should not be more than that.
    6621             :          */
    6622           0 :         pad = data->aux_size - size;
    6623           0 :         if (WARN_ON_ONCE(pad >= sizeof(u64)))
    6624             :                 pad = 8;
    6625             : 
    6626           0 :         if (pad) {
    6627           0 :                 u64 zero = 0;
    6628           0 :                 perf_output_copy(handle, &zero, pad);
    6629             :         }
    6630             : 
    6631           0 : out_put:
    6632           0 :         ring_buffer_put(rb);
    6633             : }
    6634             : 
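AUX samples attach a snapshot of another event's AUX buffer to each sample, sized by attr.aux_sample_size and clamped in perf_prepare_sample_aux() above. A sketch of requesting it from user space; how the sampled event gets paired with an AUX-producing event (event->aux_event in the kernel) is set up separately and not shown here.

    #include <linux/perf_event.h>

    /* Ask for up to 4 KiB of AUX data to be copied into each sample. */
    static void want_aux_samples(struct perf_event_attr *attr)
    {
            attr->sample_type    |= PERF_SAMPLE_AUX;
            attr->aux_sample_size = 4096;
    }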
    6635           0 : static void __perf_event_header__init_id(struct perf_event_header *header,
    6636             :                                          struct perf_sample_data *data,
    6637             :                                          struct perf_event *event)
    6638             : {
    6639           0 :         u64 sample_type = event->attr.sample_type;
    6640             : 
    6641           0 :         data->type = sample_type;
    6642           0 :         header->size += event->id_header_size;
    6643             : 
    6644           0 :         if (sample_type & PERF_SAMPLE_TID) {
    6645             :                 /* namespace issues */
    6646           0 :                 data->tid_entry.pid = perf_event_pid(event, current);
    6647           0 :                 data->tid_entry.tid = perf_event_tid(event, current);
    6648             :         }
    6649             : 
    6650           0 :         if (sample_type & PERF_SAMPLE_TIME)
    6651           0 :                 data->time = perf_event_clock(event);
    6652             : 
    6653           0 :         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
    6654           0 :                 data->id = primary_event_id(event);
    6655             : 
    6656           0 :         if (sample_type & PERF_SAMPLE_STREAM_ID)
    6657           0 :                 data->stream_id = event->id;
    6658             : 
    6659           0 :         if (sample_type & PERF_SAMPLE_CPU) {
    6660           0 :                 data->cpu_entry.cpu   = raw_smp_processor_id();
    6661           0 :                 data->cpu_entry.reserved = 0;
    6662             :         }
    6663           0 : }
    6664             : 
    6665           0 : void perf_event_header__init_id(struct perf_event_header *header,
    6666             :                                 struct perf_sample_data *data,
    6667             :                                 struct perf_event *event)
    6668             : {
    6669           0 :         if (event->attr.sample_id_all)
    6670           0 :                 __perf_event_header__init_id(header, data, event);
    6671           0 : }
    6672             : 
    6673           0 : static void __perf_event__output_id_sample(struct perf_output_handle *handle,
    6674             :                                            struct perf_sample_data *data)
    6675             : {
    6676           0 :         u64 sample_type = data->type;
    6677             : 
    6678           0 :         if (sample_type & PERF_SAMPLE_TID)
    6679           0 :                 perf_output_put(handle, data->tid_entry);
    6680             : 
    6681           0 :         if (sample_type & PERF_SAMPLE_TIME)
    6682           0 :                 perf_output_put(handle, data->time);
    6683             : 
    6684           0 :         if (sample_type & PERF_SAMPLE_ID)
    6685           0 :                 perf_output_put(handle, data->id);
    6686             : 
    6687           0 :         if (sample_type & PERF_SAMPLE_STREAM_ID)
    6688           0 :                 perf_output_put(handle, data->stream_id);
    6689             : 
    6690           0 :         if (sample_type & PERF_SAMPLE_CPU)
    6691           0 :                 perf_output_put(handle, data->cpu_entry);
    6692             : 
    6693           0 :         if (sample_type & PERF_SAMPLE_IDENTIFIER)
    6694           0 :                 perf_output_put(handle, data->id);
    6695           0 : }
    6696             : 
    6697           0 : void perf_event__output_id_sample(struct perf_event *event,
    6698             :                                   struct perf_output_handle *handle,
    6699             :                                   struct perf_sample_data *sample)
    6700             : {
    6701           0 :         if (event->attr.sample_id_all)
    6702           0 :                 __perf_event__output_id_sample(handle, sample);
    6703           0 : }
    6704             : 
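perf_event_header__init_id()/perf_event__output_id_sample() append the TID/TIME/ID/STREAM_ID/CPU/IDENTIFIER trailer to non-sample records when attr.sample_id_all is set, in exactly the order shown above. A small sketch of enabling it so side-band records (MMAP, COMM, ...) can be correlated with samples; a decoder just walks the trailer in that same bit order.

    #include <linux/perf_event.h>

    /* Ask for the id trailer on every record emitted by this event. */
    static void want_id_on_all_records(struct perf_event_attr *attr)
    {
            attr->sample_id_all = 1;
            attr->sample_type  |= PERF_SAMPLE_TID | PERF_SAMPLE_TIME |
                                  PERF_SAMPLE_IDENTIFIER;
    }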
    6705           0 : static void perf_output_read_one(struct perf_output_handle *handle,
    6706             :                                  struct perf_event *event,
    6707             :                                  u64 enabled, u64 running)
    6708             : {
    6709           0 :         u64 read_format = event->attr.read_format;
    6710           0 :         u64 values[4];
    6711           0 :         int n = 0;
    6712             : 
    6713           0 :         values[n++] = perf_event_count(event);
    6714           0 :         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
    6715           0 :                 values[n++] = enabled +
    6716           0 :                         atomic64_read(&event->child_total_time_enabled);
    6717             :         }
    6718           0 :         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
    6719           0 :                 values[n++] = running +
    6720           0 :                         atomic64_read(&event->child_total_time_running);
    6721             :         }
    6722           0 :         if (read_format & PERF_FORMAT_ID)
    6723           0 :                 values[n++] = primary_event_id(event);
    6724             : 
    6725           0 :         __output_copy(handle, values, n * sizeof(u64));
    6726           0 : }
    6727             : 
    6728           0 : static void perf_output_read_group(struct perf_output_handle *handle,
    6729             :                             struct perf_event *event,
    6730             :                             u64 enabled, u64 running)
    6731             : {
    6732           0 :         struct perf_event *leader = event->group_leader, *sub;
    6733           0 :         u64 read_format = event->attr.read_format;
    6734           0 :         u64 values[5];
    6735           0 :         int n = 0;
    6736             : 
    6737           0 :         values[n++] = 1 + leader->nr_siblings;
    6738             : 
    6739           0 :         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
    6740           0 :                 values[n++] = enabled;
    6741             : 
    6742           0 :         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
    6743           0 :                 values[n++] = running;
    6744             : 
    6745           0 :         if ((leader != event) &&
    6746           0 :             (leader->state == PERF_EVENT_STATE_ACTIVE))
    6747           0 :                 leader->pmu->read(leader);
    6748             : 
    6749           0 :         values[n++] = perf_event_count(leader);
    6750           0 :         if (read_format & PERF_FORMAT_ID)
    6751           0 :                 values[n++] = primary_event_id(leader);
    6752             : 
    6753           0 :         __output_copy(handle, values, n * sizeof(u64));
    6754             : 
    6755           0 :         for_each_sibling_event(sub, leader) {
    6756           0 :                 n = 0;
    6757             : 
    6758           0 :                 if ((sub != event) &&
    6759           0 :                     (sub->state == PERF_EVENT_STATE_ACTIVE))
    6760           0 :                         sub->pmu->read(sub);
    6761             : 
    6762           0 :                 values[n++] = perf_event_count(sub);
    6763           0 :                 if (read_format & PERF_FORMAT_ID)
    6764           0 :                         values[n++] = primary_event_id(sub);
    6765             : 
    6766           0 :                 __output_copy(handle, values, n * sizeof(u64));
    6767             :         }
    6768           0 : }
    6769             : 
    6770             : #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
    6771             :                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
    6772             : 
    6773             : /*
    6774             :  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
    6775             :  *
    6776             :  * The problem is that it's both hard and excessively expensive to iterate the
    6777             :  * child list, not to mention that it's impossible to IPI the children running
    6778             :  * on another CPU from interrupt/NMI context.
    6779             :  */
    6780           0 : static void perf_output_read(struct perf_output_handle *handle,
    6781             :                              struct perf_event *event)
    6782             : {
    6783           0 :         u64 enabled = 0, running = 0, now;
    6784           0 :         u64 read_format = event->attr.read_format;
    6785             : 
    6786             :         /*
    6787             :          * compute total_time_enabled, total_time_running
    6788             :          * based on snapshot values taken when the event
    6789             :          * was last scheduled in.
    6790             :          *
    6791             :          * we cannot simply call update_context_time()
    6792             :          * because of locking issues, as we are called in
    6793             :          * NMI context
    6794             :          */
    6795           0 :         if (read_format & PERF_FORMAT_TOTAL_TIMES)
    6796           0 :                 calc_timer_values(event, &now, &enabled, &running);
    6797             : 
    6798           0 :         if (event->attr.read_format & PERF_FORMAT_GROUP)
    6799           0 :                 perf_output_read_group(handle, event, enabled, running);
    6800             :         else
    6801           0 :                 perf_output_read_one(handle, event, enabled, running);
    6802           0 : }
    6803             : 
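perf_output_read_one()/perf_output_read_group() above mirror the layout that read(2) on the fd returns, selected by attr.read_format. A user-space sketch for the PERF_FORMAT_GROUP variant with both time fields and per-event IDs enabled; MAX_GROUP_EVENTS is an assumed bound, not part of the ABI.

    #include <linux/perf_event.h>
    #include <stdint.h>
    #include <unistd.h>

    #define MAX_GROUP_EVENTS 8      /* assumed upper bound on group size */

    /* Matches read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
     * PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID. */
    struct group_read {
            uint64_t nr;
            uint64_t time_enabled;
            uint64_t time_running;
            struct { uint64_t value, id; } values[MAX_GROUP_EVENTS];
    };

    static int read_group(int leader_fd, struct group_read *out)
    {
            ssize_t n = read(leader_fd, out, sizeof(*out));

            return n > 0 ? (int)out->nr : -1;
    }

The time fields only appear when the corresponding PERF_FORMAT_TOTAL_TIME_* bits are set, and the id field per value only with PERF_FORMAT_ID, matching the conditional writes above.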
    6804           0 : static inline bool perf_sample_save_hw_index(struct perf_event *event)
    6805             : {
    6806           0 :         return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX;
    6807             : }
    6808             : 
    6809           0 : void perf_output_sample(struct perf_output_handle *handle,
    6810             :                         struct perf_event_header *header,
    6811             :                         struct perf_sample_data *data,
    6812             :                         struct perf_event *event)
    6813             : {
    6814           0 :         u64 sample_type = data->type;
    6815             : 
    6816           0 :         perf_output_put(handle, *header);
    6817             : 
    6818           0 :         if (sample_type & PERF_SAMPLE_IDENTIFIER)
    6819           0 :                 perf_output_put(handle, data->id);
    6820             : 
    6821           0 :         if (sample_type & PERF_SAMPLE_IP)
    6822           0 :                 perf_output_put(handle, data->ip);
    6823             : 
    6824           0 :         if (sample_type & PERF_SAMPLE_TID)
    6825           0 :                 perf_output_put(handle, data->tid_entry);
    6826             : 
    6827           0 :         if (sample_type & PERF_SAMPLE_TIME)
    6828           0 :                 perf_output_put(handle, data->time);
    6829             : 
    6830           0 :         if (sample_type & PERF_SAMPLE_ADDR)
    6831           0 :                 perf_output_put(handle, data->addr);
    6832             : 
    6833           0 :         if (sample_type & PERF_SAMPLE_ID)
    6834           0 :                 perf_output_put(handle, data->id);
    6835             : 
    6836           0 :         if (sample_type & PERF_SAMPLE_STREAM_ID)
    6837           0 :                 perf_output_put(handle, data->stream_id);
    6838             : 
    6839           0 :         if (sample_type & PERF_SAMPLE_CPU)
    6840           0 :                 perf_output_put(handle, data->cpu_entry);
    6841             : 
    6842           0 :         if (sample_type & PERF_SAMPLE_PERIOD)
    6843           0 :                 perf_output_put(handle, data->period);
    6844             : 
    6845           0 :         if (sample_type & PERF_SAMPLE_READ)
    6846           0 :                 perf_output_read(handle, event);
    6847             : 
    6848           0 :         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
    6849           0 :                 int size = 1;
    6850             : 
    6851           0 :                 size += data->callchain->nr;
    6852           0 :                 size *= sizeof(u64);
    6853           0 :                 __output_copy(handle, data->callchain, size);
    6854             :         }
    6855             : 
    6856           0 :         if (sample_type & PERF_SAMPLE_RAW) {
    6857           0 :                 struct perf_raw_record *raw = data->raw;
    6858             : 
    6859           0 :                 if (raw) {
    6860           0 :                         struct perf_raw_frag *frag = &raw->frag;
    6861             : 
    6862           0 :                         perf_output_put(handle, raw->size);
    6863           0 :                         do {
    6864           0 :                                 if (frag->copy) {
    6865           0 :                                         __output_custom(handle, frag->copy,
    6866           0 :                                                         frag->data, frag->size);
    6867             :                                 } else {
    6868           0 :                                         __output_copy(handle, frag->data,
    6869           0 :                                                       frag->size);
    6870             :                                 }
    6871           0 :                                 if (perf_raw_frag_last(frag))
    6872             :                                         break;
    6873           0 :                                 frag = frag->next;
    6874           0 :                         } while (1);
    6875           0 :                         if (frag->pad)
    6876           0 :                                 __output_skip(handle, NULL, frag->pad);
    6877             :                 } else {
    6878           0 :                         struct {
    6879             :                                 u32     size;
    6880             :                                 u32     data;
    6881           0 :                         } raw = {
    6882             :                                 .size = sizeof(u32),
    6883             :                                 .data = 0,
    6884             :                         };
    6885           0 :                         perf_output_put(handle, raw);
    6886             :                 }
    6887             :         }
    6888             : 
    6889           0 :         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
    6890           0 :                 if (data->br_stack) {
    6891           0 :                         size_t size;
    6892             : 
    6893           0 :                         size = data->br_stack->nr
    6894             :                              * sizeof(struct perf_branch_entry);
    6895             : 
    6896           0 :                         perf_output_put(handle, data->br_stack->nr);
    6897           0 :                         if (perf_sample_save_hw_index(event))
    6898           0 :                                 perf_output_put(handle, data->br_stack->hw_idx);
    6899           0 :                         perf_output_copy(handle, data->br_stack->entries, size);
    6900             :                 } else {
    6901             :                         /*
    6902             :                          * we always store at least the value of nr
    6903             :                          */
    6904           0 :                         u64 nr = 0;
    6905           0 :                         perf_output_put(handle, nr);
    6906             :                 }
    6907             :         }
    6908             : 
    6909           0 :         if (sample_type & PERF_SAMPLE_REGS_USER) {
    6910           0 :                 u64 abi = data->regs_user.abi;
    6911             : 
    6912             :                 /*
    6913             :                  * If there are no regs to dump, notice it through
    6914             :                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
    6915             :                  */
    6916           0 :                 perf_output_put(handle, abi);
    6917             : 
    6918           0 :                 if (abi) {
    6919           0 :                         u64 mask = event->attr.sample_regs_user;
    6920           0 :                         perf_output_sample_regs(handle,
    6921             :                                                 data->regs_user.regs,
    6922             :                                                 mask);
    6923             :                 }
    6924             :         }
    6925             : 
    6926           0 :         if (sample_type & PERF_SAMPLE_STACK_USER) {
    6927           0 :                 perf_output_sample_ustack(handle,
    6928             :                                           data->stack_user_size,
    6929             :                                           data->regs_user.regs);
    6930             :         }
    6931             : 
    6932           0 :         if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
    6933           0 :                 perf_output_put(handle, data->weight.full);
    6934             : 
    6935           0 :         if (sample_type & PERF_SAMPLE_DATA_SRC)
    6936           0 :                 perf_output_put(handle, data->data_src.val);
    6937             : 
    6938           0 :         if (sample_type & PERF_SAMPLE_TRANSACTION)
    6939           0 :                 perf_output_put(handle, data->txn);
    6940             : 
    6941           0 :         if (sample_type & PERF_SAMPLE_REGS_INTR) {
    6942           0 :                 u64 abi = data->regs_intr.abi;
    6943             :                 /*
    6944             :                  * If there are no regs to dump, notice it through
    6945             :                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
    6946             :                  */
    6947           0 :                 perf_output_put(handle, abi);
    6948             : 
    6949           0 :                 if (abi) {
    6950           0 :                         u64 mask = event->attr.sample_regs_intr;
    6951             : 
    6952           0 :                         perf_output_sample_regs(handle,
    6953             :                                                 data->regs_intr.regs,
    6954             :                                                 mask);
    6955             :                 }
    6956             :         }
    6957             : 
    6958           0 :         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
    6959           0 :                 perf_output_put(handle, data->phys_addr);
    6960             : 
    6961           0 :         if (sample_type & PERF_SAMPLE_CGROUP)
    6962           0 :                 perf_output_put(handle, data->cgroup);
    6963             : 
    6964           0 :         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
    6965           0 :                 perf_output_put(handle, data->data_page_size);
    6966             : 
    6967           0 :         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
    6968           0 :                 perf_output_put(handle, data->code_page_size);
    6969             : 
    6970           0 :         if (sample_type & PERF_SAMPLE_AUX) {
    6971           0 :                 perf_output_put(handle, data->aux_size);
    6972             : 
    6973           0 :                 if (data->aux_size)
    6974           0 :                         perf_aux_sample_output(event, handle, data);
    6975             :         }
    6976             : 
    6977           0 :         if (!event->attr.watermark) {
    6978           0 :                 int wakeup_events = event->attr.wakeup_events;
    6979             : 
    6980           0 :                 if (wakeup_events) {
    6981           0 :                         struct perf_buffer *rb = handle->rb;
    6982           0 :                         int events = local_inc_return(&rb->events);
    6983             : 
    6984           0 :                         if (events >= wakeup_events) {
    6985           0 :                                 local_sub(wakeup_events, &rb->events);
    6986           0 :                                 local_inc(&rb->wakeup);
    6987             :                         }
    6988             :                 }
    6989             :         }
    6990           0 : }
    6991             : 
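perf_output_sample() writes the enabled fields in the fixed order of the checks above, so a decoder walks the record in the same order, testing sample_type bit by bit. A partial user-space sketch for the leading fixed-size fields; READ, CALLCHAIN, RAW and the rest follow in the order the function continues with.

    #include <linux/perf_event.h>
    #include <stdint.h>

    struct sample_head {
            uint64_t id, ip, pid_tid, time, addr, stream_id, cpu_res, period;
    };

    /* Decode the leading fixed-size fields of a PERF_RECORD_SAMPLE body. */
    static const uint64_t *decode_sample_head(const uint64_t *p, uint64_t sample_type,
                                              struct sample_head *out)
    {
            if (sample_type & PERF_SAMPLE_IDENTIFIER) out->id        = *p++;
            if (sample_type & PERF_SAMPLE_IP)         out->ip        = *p++;
            if (sample_type & PERF_SAMPLE_TID)        out->pid_tid   = *p++; /* pid|tid */
            if (sample_type & PERF_SAMPLE_TIME)       out->time      = *p++;
            if (sample_type & PERF_SAMPLE_ADDR)       out->addr      = *p++;
            if (sample_type & PERF_SAMPLE_ID)         out->id        = *p++;
            if (sample_type & PERF_SAMPLE_STREAM_ID)  out->stream_id = *p++;
            if (sample_type & PERF_SAMPLE_CPU)        out->cpu_res   = *p++; /* cpu|res */
            if (sample_type & PERF_SAMPLE_PERIOD)     out->period    = *p++;
            return p;   /* READ / CALLCHAIN / RAW / ... follow, as above */
    }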
    6992           0 : static u64 perf_virt_to_phys(u64 virt)
    6993             : {
    6994           0 :         u64 phys_addr = 0;
    6995           0 :         struct page *p = NULL;
    6996             : 
    6997           0 :         if (!virt)
    6998             :                 return 0;
    6999             : 
    7000           0 :         if (virt >= TASK_SIZE) {
    7001             :                 /* If it's vmalloc()d memory, leave phys_addr as 0 */
    7002           0 :                 if (virt_addr_valid((void *)(uintptr_t)virt) &&
    7003           0 :                     !(virt >= VMALLOC_START && virt < VMALLOC_END))
    7004           0 :                         phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
    7005             :         } else {
    7006             :                 /*
    7007             :                  * Walk the page tables for a user address.
    7008             :                  * Interrupts are disabled, which prevents any tear down
    7009             :                  * of the page tables.
    7010             :                  * Try the IRQ-safe get_user_page_fast_only() first.
    7011             :                  * If that fails, leave phys_addr as 0.
    7012             :                  */
    7013           0 :                 if (current->mm != NULL) {
    7014           0 :                         pagefault_disable();
    7015           0 :                         if (get_user_page_fast_only(virt, 0, &p))
    7016           0 :                                 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
    7017           0 :                         pagefault_enable();
    7018             :                 }
    7019             : 
    7020           0 :                 if (p)
    7021           0 :                         put_page(p);
    7022             :         }
    7023             : 
    7024             :         return phys_addr;
    7025             : }
    7026             : 
    7027             : /*
    7028             :  * Return the pagetable size of a given virtual address.
    7029             :  */
    7030           0 : static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
    7031             : {
    7032           0 :         u64 size = 0;
    7033             : 
    7034             : #ifdef CONFIG_HAVE_FAST_GUP
    7035           0 :         pgd_t *pgdp, pgd;
    7036           0 :         p4d_t *p4dp, p4d;
    7037           0 :         pud_t *pudp, pud;
    7038           0 :         pmd_t *pmdp, pmd;
    7039           0 :         pte_t *ptep, pte;
    7040             : 
    7041           0 :         pgdp = pgd_offset(mm, addr);
    7042           0 :         pgd = READ_ONCE(*pgdp);
    7043           0 :         if (pgd_none(pgd))
    7044             :                 return 0;
    7045             : 
    7046           0 :         if (pgd_leaf(pgd))
    7047             :                 return pgd_leaf_size(pgd);
    7048             : 
    7049           0 :         p4dp = p4d_offset_lockless(pgdp, pgd, addr);
    7050           0 :         p4d = READ_ONCE(*p4dp);
    7051           0 :         if (!p4d_present(p4d))
    7052             :                 return 0;
    7053             : 
    7054           0 :         if (p4d_leaf(p4d))
    7055             :                 return p4d_leaf_size(p4d);
    7056             : 
    7057           0 :         pudp = pud_offset_lockless(p4dp, p4d, addr);
    7058           0 :         pud = READ_ONCE(*pudp);
    7059           0 :         if (!pud_present(pud))
    7060             :                 return 0;
    7061             : 
    7062           0 :         if (pud_leaf(pud))
    7063             :                 return pud_leaf_size(pud);
    7064             : 
    7065           0 :         pmdp = pmd_offset_lockless(pudp, pud, addr);
    7066           0 :         pmd = READ_ONCE(*pmdp);
    7067           0 :         if (!pmd_present(pmd))
    7068             :                 return 0;
    7069             : 
    7070           0 :         if (pmd_leaf(pmd))
    7071             :                 return pmd_leaf_size(pmd);
    7072             : 
    7073           0 :         ptep = pte_offset_map(&pmd, addr);
    7074           0 :         pte = ptep_get_lockless(ptep);
    7075           0 :         if (pte_present(pte))
    7076           0 :                 size = pte_leaf_size(pte);
    7077             :         pte_unmap(ptep);
    7078             : #endif /* CONFIG_HAVE_FAST_GUP */
    7079             : 
    7080             :         return size;
    7081             : }
    7082             : 
    7083           0 : static u64 perf_get_page_size(unsigned long addr)
    7084             : {
    7085           0 :         struct mm_struct *mm;
    7086           0 :         unsigned long flags;
    7087           0 :         u64 size;
    7088             : 
    7089           0 :         if (!addr)
    7090             :                 return 0;
    7091             : 
    7092             :         /*
    7093             :          * Software page-table walkers must disable IRQs,
    7094             :          * which prevents any tear down of the page tables.
    7095             :          */
    7096           0 :         local_irq_save(flags);
    7097             : 
    7098           0 :         mm = current->mm;
    7099           0 :         if (!mm) {
    7100             :                 /*
    7101             :                  * For kernel threads and the like, use init_mm so that
    7102             :                  * we can find kernel memory.
    7103             :                  */
    7104           0 :                 mm = &init_mm;
    7105             :         }
    7106             : 
    7107           0 :         size = perf_get_pgtable_size(mm, addr);
    7108             : 
    7109           0 :         local_irq_restore(flags);
    7110             : 
    7111             :         return size;
    7112             : }
    7113             : 
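perf_virt_to_phys() and perf_get_page_size() fill in three sample fields that user space only has to ask for. A tiny sketch of requesting them, assuming sufficiently new uapi headers; note that the kernel may require extra privilege before exposing unfiltered physical addresses.

    #include <linux/perf_event.h>

    static void want_addr_details(struct perf_event_attr *attr)
    {
            attr->sample_type |= PERF_SAMPLE_ADDR |
                                 PERF_SAMPLE_PHYS_ADDR |
                                 PERF_SAMPLE_DATA_PAGE_SIZE |
                                 PERF_SAMPLE_CODE_PAGE_SIZE;
    }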
    7114             : static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
    7115             : 
    7116             : struct perf_callchain_entry *
    7117           0 : perf_callchain(struct perf_event *event, struct pt_regs *regs)
    7118             : {
    7119           0 :         bool kernel = !event->attr.exclude_callchain_kernel;
    7120           0 :         bool user   = !event->attr.exclude_callchain_user;
    7121             :         /* Disallow cross-task user callchains. */
    7122           0 :         bool crosstask = event->ctx->task && event->ctx->task != current;
    7123           0 :         const u32 max_stack = event->attr.sample_max_stack;
    7124           0 :         struct perf_callchain_entry *callchain;
    7125             : 
    7126           0 :         if (!kernel && !user)
    7127             :                 return &__empty_callchain;
    7128             : 
    7129           0 :         callchain = get_perf_callchain(regs, 0, kernel, user,
    7130             :                                        max_stack, crosstask, true);
    7131           0 :         return callchain ?: &__empty_callchain;
    7132             : }
    7133             : 
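perf_callchain() honours exclude_callchain_kernel/user and sample_max_stack from the attr. A sketch of the user-space request and of walking the resulting { nr, ips[nr] } array in the sample; the depth of 64 is an arbitrary example.

    #include <linux/perf_event.h>
    #include <stdint.h>

    static void want_callchain(struct perf_event_attr *attr)
    {
            attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
            attr->sample_max_stack = 64;            /* cap the depth */
            attr->exclude_callchain_kernel = 0;     /* keep kernel frames */
            attr->exclude_callchain_user   = 0;     /* keep user frames */
    }

    /* In the sample: u64 nr, then nr instruction pointers. */
    static const uint64_t *decode_callchain(const uint64_t *p,
                                            void (*visit)(uint64_t ip))
    {
            uint64_t nr = *p++;

            for (uint64_t i = 0; i < nr; i++)
                    visit(p[i]);
            return p + nr;
    }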
    7134           0 : void perf_prepare_sample(struct perf_event_header *header,
    7135             :                          struct perf_sample_data *data,
    7136             :                          struct perf_event *event,
    7137             :                          struct pt_regs *regs)
    7138             : {
    7139           0 :         u64 sample_type = event->attr.sample_type;
    7140             : 
    7141           0 :         header->type = PERF_RECORD_SAMPLE;
    7142           0 :         header->size = sizeof(*header) + event->header_size;
    7143             : 
    7144           0 :         header->misc = 0;
    7145           0 :         header->misc |= perf_misc_flags(regs);
    7146             : 
    7147           0 :         __perf_event_header__init_id(header, data, event);
    7148             : 
    7149           0 :         if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE))
    7150           0 :                 data->ip = perf_instruction_pointer(regs);
    7151             : 
    7152           0 :         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
    7153           0 :                 int size = 1;
    7154             : 
    7155           0 :                 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
    7156           0 :                         data->callchain = perf_callchain(event, regs);
    7157             : 
    7158           0 :                 size += data->callchain->nr;
    7159             : 
    7160           0 :                 header->size += size * sizeof(u64);
    7161             :         }
    7162             : 
    7163           0 :         if (sample_type & PERF_SAMPLE_RAW) {
    7164           0 :                 struct perf_raw_record *raw = data->raw;
    7165           0 :                 int size;
    7166             : 
    7167           0 :                 if (raw) {
    7168           0 :                         struct perf_raw_frag *frag = &raw->frag;
    7169           0 :                         u32 sum = 0;
    7170             : 
    7171           0 :                         do {
    7172           0 :                                 sum += frag->size;
    7173           0 :                                 if (perf_raw_frag_last(frag))
    7174             :                                         break;
    7175           0 :                                 frag = frag->next;
    7176           0 :                         } while (1);
    7177             : 
    7178           0 :                         size = round_up(sum + sizeof(u32), sizeof(u64));
    7179           0 :                         raw->size = size - sizeof(u32);
    7180           0 :                         frag->pad = raw->size - sum;
    7181             :                 } else {
    7182             :                         size = sizeof(u64);
    7183             :                 }
    7184             : 
    7185           0 :                 header->size += size;
    7186             :         }
    7187             : 
    7188           0 :         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
    7189           0 :                 int size = sizeof(u64); /* nr */
    7190           0 :                 if (data->br_stack) {
    7191           0 :                         if (perf_sample_save_hw_index(event))
    7192           0 :                                 size += sizeof(u64);
    7193             : 
    7194           0 :                         size += data->br_stack->nr
    7195             :                               * sizeof(struct perf_branch_entry);
    7196             :                 }
    7197           0 :                 header->size += size;
    7198             :         }
    7199             : 
    7200           0 :         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
    7201           0 :                 perf_sample_regs_user(&data->regs_user, regs);
    7202             : 
    7203           0 :         if (sample_type & PERF_SAMPLE_REGS_USER) {
    7204             :                 /* regs dump ABI info */
    7205           0 :                 int size = sizeof(u64);
    7206             : 
    7207           0 :                 if (data->regs_user.regs) {
    7208           0 :                         u64 mask = event->attr.sample_regs_user;
    7209           0 :                         size += hweight64(mask) * sizeof(u64);
    7210             :                 }
    7211             : 
    7212           0 :                 header->size += size;
    7213             :         }
    7214             : 
    7215           0 :         if (sample_type & PERF_SAMPLE_STACK_USER) {
    7216             :                 /*
    7217             :                  * Either we need PERF_SAMPLE_STACK_USER bit to be always
    7218             :                  * processed as the last one or have additional check added
    7219             :                  * in case new sample type is added, because we could eat
    7220             :                  * up the rest of the sample size.
    7221             :                  */
    7222           0 :                 u16 stack_size = event->attr.sample_stack_user;
    7223           0 :                 u16 size = sizeof(u64);
    7224             : 
    7225           0 :                 stack_size = perf_sample_ustack_size(stack_size, header->size,
    7226             :                                                      data->regs_user.regs);
    7227             : 
    7228             :                 /*
    7229             :                  * If there is something to dump, add space for the dump
    7230             :                  * itself and for the field that tells the dynamic size,
    7231             :                  * which is how many have been actually dumped.
    7232             :                  */
    7233           0 :                 if (stack_size)
    7234           0 :                         size += sizeof(u64) + stack_size;
    7235             : 
    7236           0 :                 data->stack_user_size = stack_size;
    7237           0 :                 header->size += size;
    7238             :         }
    7239             : 
    7240           0 :         if (sample_type & PERF_SAMPLE_REGS_INTR) {
    7241             :                 /* regs dump ABI info */
    7242           0 :                 int size = sizeof(u64);
    7243             : 
    7244           0 :                 perf_sample_regs_intr(&data->regs_intr, regs);
    7245             : 
    7246           0 :                 if (data->regs_intr.regs) {
    7247           0 :                         u64 mask = event->attr.sample_regs_intr;
    7248             : 
    7249           0 :                         size += hweight64(mask) * sizeof(u64);
    7250             :                 }
    7251             : 
    7252           0 :                 header->size += size;
    7253             :         }
    7254             : 
    7255           0 :         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
    7256           0 :                 data->phys_addr = perf_virt_to_phys(data->addr);
    7257             : 
    7258             : #ifdef CONFIG_CGROUP_PERF
    7259             :         if (sample_type & PERF_SAMPLE_CGROUP) {
    7260             :                 struct cgroup *cgrp;
    7261             : 
    7262             :                 /* protected by RCU */
    7263             :                 cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
    7264             :                 data->cgroup = cgroup_id(cgrp);
    7265             :         }
    7266             : #endif
    7267             : 
    7268             :         /*
    7269             :          * PERF_SAMPLE_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user
    7270             :          * doesn't request PERF_SAMPLE_ADDR, the kernel implicitly retrieves
    7271             :          * data->addr, but the value is not dumped to userspace.
    7272             :          */
    7273           0 :         if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
    7274           0 :                 data->data_page_size = perf_get_page_size(data->addr);
    7275             : 
    7276           0 :         if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
    7277           0 :                 data->code_page_size = perf_get_page_size(data->ip);
    7278             : 
    7279           0 :         if (sample_type & PERF_SAMPLE_AUX) {
    7280           0 :                 u64 size;
    7281             : 
    7282           0 :                 header->size += sizeof(u64); /* size */
    7283             : 
    7284             :                 /*
    7285             :                  * Given the 16-bit nature of header::size, an AUX sample can
    7286             :                  * easily overflow it, what with all the preceding sample bits.
    7287             :                  * Make sure this doesn't happen by using up to U16_MAX bytes
    7288             :                  * per sample in total (rounded down to an 8-byte boundary).
    7289             :                  */
    7290           0 :                 size = min_t(size_t, U16_MAX - header->size,
    7291             :                              event->attr.aux_sample_size);
    7292           0 :                 size = rounddown(size, 8);
    7293           0 :                 size = perf_prepare_sample_aux(event, data, size);
    7294             : 
    7295           0 :                 WARN_ON_ONCE(size + header->size > U16_MAX);
    7296           0 :                 header->size += size;
    7297             :         }
    7298             :         /*
    7299             :          * If you're adding more sample types here, you likely need to do
    7300             :          * something about header::size overflowing, such as repurposing the
    7301             :          * lowest 3 bits of size, which should always be zero at the moment.
    7302             :          * This raises a more important question: do we really need 512k-sized
    7303             :          * samples, and why? Good argumentation is in order for whatever you
    7304             :          * do here next.
    7305             :          */
    7306           0 :         WARN_ON_ONCE(header->size & 7);
    7307           0 : }
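
A minimal userspace C sketch of the size arithmetic above, assuming stand-in helpers for hweight64(), rounddown() and U16_MAX and made-up mask/size values; it is not kernel code, only an illustration of why header->size never exceeds U16_MAX and stays a multiple of 8.

#include <stdint.h>
#include <stdio.h>

#define U16_MAX 0xffffU

/* stand-in for the kernel's hweight64(): population count of a 64-bit mask */
static unsigned int hweight64(uint64_t w)
{
        unsigned int n = 0;

        while (w) {
                n += (unsigned int)(w & 1);
                w >>= 1;
        }
        return n;
}

/* stand-in for the kernel's rounddown(x, 8) */
static uint64_t rounddown8(uint64_t x)
{
        return x & ~(uint64_t)7;
}

int main(void)
{
        uint64_t header_size = 64;         /* bytes already accounted for (made up) */
        uint64_t sample_regs_intr = 0x1ff; /* hypothetical 9-register mask */
        uint64_t aux_sample_size = 100000; /* hypothetical attr.aux_sample_size */
        uint64_t aux;

        /* regs dump: one u64 of ABI info plus one u64 per set mask bit */
        header_size += sizeof(uint64_t) + hweight64(sample_regs_intr) * sizeof(uint64_t);

        /* AUX dump: one u64 size field, then cap the payload so the total fits in 16 bits */
        header_size += sizeof(uint64_t);
        aux = aux_sample_size;
        if (aux > U16_MAX - header_size)
                aux = U16_MAX - header_size;
        aux = rounddown8(aux);
        header_size += aux;

        printf("header.size = %llu, multiple of 8: %s\n",
               (unsigned long long)header_size,
               (header_size & 7) ? "no" : "yes");
        return 0;
}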
    7308             : 
    7309             : static __always_inline int
    7310           0 : __perf_event_output(struct perf_event *event,
    7311             :                     struct perf_sample_data *data,
    7312             :                     struct pt_regs *regs,
    7313             :                     int (*output_begin)(struct perf_output_handle *,
    7314             :                                         struct perf_sample_data *,
    7315             :                                         struct perf_event *,
    7316             :                                         unsigned int))
    7317             : {
    7318           0 :         struct perf_output_handle handle;
    7319           0 :         struct perf_event_header header;
    7320           0 :         int err;
    7321             : 
    7322             :         /* protect the callchain buffers */
    7323           0 :         rcu_read_lock();
    7324             : 
    7325           0 :         perf_prepare_sample(&header, data, event, regs);
    7326             : 
    7327           0 :         err = output_begin(&handle, data, event, header.size);
    7328           0 :         if (err)
    7329           0 :                 goto exit;
    7330             : 
    7331           0 :         perf_output_sample(&handle, &header, data, event);
    7332             : 
    7333           0 :         perf_output_end(&handle);
    7334             : 
    7335           0 : exit:
    7336           0 :         rcu_read_unlock();
    7337           0 :         return err;
    7338             : }
    7339             : 
    7340             : void
    7341           0 : perf_event_output_forward(struct perf_event *event,
    7342             :                          struct perf_sample_data *data,
    7343             :                          struct pt_regs *regs)
    7344             : {
    7345           0 :         __perf_event_output(event, data, regs, perf_output_begin_forward);
    7346           0 : }
    7347             : 
    7348             : void
    7349           0 : perf_event_output_backward(struct perf_event *event,
    7350             :                            struct perf_sample_data *data,
    7351             :                            struct pt_regs *regs)
    7352             : {
    7353           0 :         __perf_event_output(event, data, regs, perf_output_begin_backward);
    7354           0 : }
    7355             : 
    7356             : int
    7357           0 : perf_event_output(struct perf_event *event,
    7358             :                   struct perf_sample_data *data,
    7359             :                   struct pt_regs *regs)
    7360             : {
    7361           0 :         return __perf_event_output(event, data, regs, perf_output_begin);
    7362             : }
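
The three entry points above differ only in how output is begun on the ring buffer; __perf_event_output() is parameterized on the output_begin callback, so each variant is a one-line wrapper. Below is a minimal sketch of that callback-parameterization pattern in plain C; the handle type and function names are hypothetical stand-ins, not the perf API.

#include <stdio.h>

struct handle {                 /* hypothetical stand-in for perf_output_handle */
        const char *mode;
};

/* the variant-specific step is passed in as a function pointer */
typedef int (*begin_fn)(struct handle *, unsigned int);

static int begin_forward(struct handle *h, unsigned int size)
{
        h->mode = "forward";
        printf("begin %s, %u bytes\n", h->mode, size);
        return 0;
}

static int begin_backward(struct handle *h, unsigned int size)
{
        h->mode = "backward";
        printf("begin %s, %u bytes\n", h->mode, size);
        return 0;
}

/* one shared body, mirroring the shape of __perf_event_output() */
static int output(unsigned int size, begin_fn begin)
{
        struct handle h;
        int err = begin(&h, size);

        if (err)
                return err;
        /* ... write the sample through the handle, then finish ... */
        return 0;
}

static int output_forward(unsigned int size)  { return output(size, begin_forward); }
static int output_backward(unsigned int size) { return output(size, begin_backward); }

int main(void)
{
        output_forward(64);
        output_backward(64);
        return 0;
}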
    7363             : 
    7364             : /*
    7365             :  * read event_id
    7366             :  */
    7367             : 
    7368             : struct perf_read_event {
    7369             :         struct perf_event_header        header;
    7370             : 
    7371             :         u32                             pid;
    7372             :         u32                             tid;
    7373             : };
    7374             : 
    7375             : static void
    7376           0 : perf_event_read_event(struct perf_event *event,
    7377             :                         struct task_struct *task)
    7378             : {
    7379           0 :         struct perf_output_handle handle;
    7380           0 :         struct perf_sample_data sample;
    7381           0 :         struct perf_read_event read_event = {
    7382             :                 .header = {
    7383             :                         .type = PERF_RECORD_READ,
    7384             :                         .misc = 0,
    7385           0 :                         .size = sizeof(read_event) + event->read_size,
    7386             :                 },
    7387           0 :                 .pid = perf_event_pid(event, task),
    7388           0 :                 .tid = perf_event_tid(event, task),
    7389             :         };
    7390           0 :         int ret;
    7391             : 
    7392           0 :         perf_event_header__init_id(&read_event.header, &sample, event);
    7393           0 :         ret = perf_output_begin(&handle, &sample, event, read_event.header.size);
    7394           0 :         if (ret)
    7395           0 :                 return;
    7396             : 
    7397           0 :         perf_output_put(&handle, read_event);
    7398           0 :         perf_output_read(&handle, event);
    7399           0 :         perf_event__output_id_sample(event, &handle, &sample);
    7400             : 
    7401           0 :         perf_output_end(&handle);
    7402             : }
    7403             : 
    7404             : typedef void (perf_iterate_f)(struct perf_event *event, void *data);
    7405             : 
    7406             : static void
    7407           0 : perf_iterate_ctx(struct perf_event_context *ctx,
    7408             :                    perf_iterate_f output,
    7409             :                    void *data, bool all)
    7410             : {
    7411           0 :         struct perf_event *event;
    7412             : 
    7413           0 :         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
    7414           0 :                 if (!all) {
    7415           0 :                         if (event->state < PERF_EVENT_STATE_INACTIVE)
    7416           0 :                                 continue;
    7417           0 :                         if (!event_filter_match(event))
    7418           0 :                                 continue;
    7419             :                 }
    7420             : 
    7421           0 :                 output(event, data);
    7422             :         }
    7423           0 : }
    7424             : 
    7425           0 : static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
    7426             : {
    7427           0 :         struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
    7428           0 :         struct perf_event *event;
    7429             : 
    7430           0 :         list_for_each_entry_rcu(event, &pel->list, sb_list) {
    7431             :                 /*
    7432             :                  * Skip events that are not fully formed yet; ensure that
    7433             :                  * if we observe event->ctx, both event and ctx will be
    7434             :                  * complete enough. See perf_install_in_context().
    7435             :                  */
    7436           0 :                 if (!smp_load_acquire(&event->ctx))
    7437           0 :                         continue;
    7438             : 
    7439           0 :                 if (event->state < PERF_EVENT_STATE_INACTIVE)
    7440           0 :                         continue;
    7441           0 :                 if (!event_filter_match(event))
    7442           0 :                         continue;
    7443           0 :                 output(event, data);
    7444             :         }
    7445           0 : }
    7446             : 
    7447             : /*
    7448             :  * Iterate all events that need to receive side-band events.
    7449             :  *
    7450             :  * For new callers: ensure that account_pmu_sb_event() includes
    7451             :  * your event, otherwise it might not get delivered.
    7452             :  */
    7453             : static void
    7454           0 : perf_iterate_sb(perf_iterate_f output, void *data,
    7455             :                struct perf_event_context *task_ctx)
    7456             : {
    7457           0 :         struct perf_event_context *ctx;
    7458           0 :         int ctxn;
    7459             : 
    7460           0 :         rcu_read_lock();
    7461           0 :         preempt_disable();
    7462             : 
    7463             :         /*
    7464             :          * If we have task_ctx != NULL we only notify the task context itself.
    7465             :          * The task_ctx is set only for EXIT events before releasing task
    7466             :          * context.
    7467             :          */
    7468           0 :         if (task_ctx) {
    7469           0 :                 perf_iterate_ctx(task_ctx, output, data, false);
    7470           0 :                 goto done;
    7471             :         }
    7472             : 
    7473           0 :         perf_iterate_sb_cpu(output, data);
    7474             : 
    7475           0 :         for_each_task_context_nr(ctxn) {
    7476           0 :                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
    7477           0 :                 if (ctx)
    7478           0 :                         perf_iterate_ctx(ctx, output, data, false);
    7479             :         }
    7480           0 : done:
    7481           0 :         preempt_enable();
    7482           0 :         rcu_read_unlock();
    7483           0 : }
    7484             : 
    7485             : /*
    7486             :  * Clear all file-based filters at exec, they'll have to be
    7487             :  * re-instated when/if these objects are mmapped again.
    7488             :  */
    7489           0 : static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
    7490             : {
    7491           0 :         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
    7492           0 :         struct perf_addr_filter *filter;
    7493           0 :         unsigned int restart = 0, count = 0;
    7494           0 :         unsigned long flags;
    7495             : 
    7496           0 :         if (!has_addr_filter(event))
    7497             :                 return;
    7498             : 
    7499           0 :         raw_spin_lock_irqsave(&ifh->lock, flags);
    7500           0 :         list_for_each_entry(filter, &ifh->list, entry) {
    7501           0 :                 if (filter->path.dentry) {
    7502           0 :                         event->addr_filter_ranges[count].start = 0;
    7503           0 :                         event->addr_filter_ranges[count].size = 0;
    7504           0 :                         restart++;
    7505             :                 }
    7506             : 
    7507           0 :                 count++;
    7508             :         }
    7509             : 
    7510           0 :         if (restart)
    7511           0 :                 event->addr_filters_gen++;
    7512           0 :         raw_spin_unlock_irqrestore(&ifh->lock, flags);
    7513             : 
    7514           0 :         if (restart)
    7515           0 :                 perf_event_stop(event, 1);
    7516             : }
    7517             : 
    7518         870 : void perf_event_exec(void)
    7519             : {
    7520         870 :         struct perf_event_context *ctx;
    7521         870 :         int ctxn;
    7522             : 
    7523         870 :         rcu_read_lock();
    7524        2610 :         for_each_task_context_nr(ctxn) {
    7525        1740 :                 ctx = current->perf_event_ctxp[ctxn];
    7526        1740 :                 if (!ctx)
    7527        1740 :                         continue;
    7528             : 
    7529           0 :                 perf_event_enable_on_exec(ctxn);
    7530             : 
    7531           0 :                 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
    7532             :                                    true);
    7533             :         }
    7534         870 :         rcu_read_unlock();
    7535         870 : }
    7536             : 
    7537             : struct remote_output {
    7538             :         struct perf_buffer      *rb;
    7539             :         int                     err;
    7540             : };
    7541             : 
    7542           0 : static void __perf_event_output_stop(struct perf_event *event, void *data)
    7543             : {
    7544           0 :         struct perf_event *parent = event->parent;
    7545           0 :         struct remote_output *ro = data;
    7546           0 :         struct perf_buffer *rb = ro->rb;
    7547           0 :         struct stop_event_data sd = {
    7548             :                 .event  = event,
    7549             :         };
    7550             : 
    7551           0 :         if (!has_aux(event))
    7552           0 :                 return;
    7553             : 
    7554           0 :         if (!parent)
    7555           0 :                 parent = event;
    7556             : 
    7557             :         /*
    7558             :          * In case of inheritance, it will be the parent that links to the
    7559             :          * ring-buffer, but it will be the child that's actually using it.
    7560             :          *
    7561             :          * We are using event::rb to determine if the event should be stopped;
    7562             :          * however, this may race with ring_buffer_attach() (through set_output),
    7563             :          * which will make us skip the event that actually needs to be stopped.
    7564             :          * So ring_buffer_attach() has to stop an aux event before re-assigning
    7565             :          * its rb pointer.
    7566             :          */
    7567           0 :         if (rcu_dereference(parent->rb) == rb)
    7568           0 :                 ro->err = __perf_event_stop(&sd);
    7569             : }
    7570             : 
    7571           0 : static int __perf_pmu_output_stop(void *info)
    7572             : {
    7573           0 :         struct perf_event *event = info;
    7574           0 :         struct pmu *pmu = event->ctx->pmu;
    7575           0 :         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
    7576           0 :         struct remote_output ro = {
    7577           0 :                 .rb     = event->rb,
    7578             :         };
    7579             : 
    7580           0 :         rcu_read_lock();
    7581           0 :         perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
    7582           0 :         if (cpuctx->task_ctx)
    7583           0 :                 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
    7584             :                                    &ro, false);
    7585           0 :         rcu_read_unlock();
    7586             : 
    7587           0 :         return ro.err;
    7588             : }
    7589             : 
    7590           0 : static void perf_pmu_output_stop(struct perf_event *event)
    7591             : {
    7592           0 :         struct perf_event *iter;
    7593           0 :         int err, cpu;
    7594             : 
    7595           0 : restart:
    7596           0 :         rcu_read_lock();
    7597           0 :         list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
    7598             :                 /*
    7599             :                  * For per-CPU events, we need to make sure that neither they
    7600             :                  * nor their children are running; for cpu==-1 events it's
    7601             :                  * sufficient to stop the event itself if it's active, since
    7602             :                  * it can't have children.
    7603             :                  */
    7604           0 :                 cpu = iter->cpu;
    7605           0 :                 if (cpu == -1)
    7606           0 :                         cpu = READ_ONCE(iter->oncpu);
    7607             : 
    7608           0 :                 if (cpu == -1)
    7609           0 :                         continue;
    7610             : 
    7611           0 :                 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
    7612           0 :                 if (err == -EAGAIN) {
    7613           0 :                         rcu_read_unlock();
    7614           0 :                         goto restart;
    7615             :                 }
    7616             :         }
    7617           0 :         rcu_read_unlock();
    7618           0 : }
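
A cpu_function_call() returning -EAGAIN above makes the walk drop the RCU read lock and start over from the head of the list. A toy sketch of that restart shape, with a hypothetical try_stop() standing in for the cross-CPU call (here it fails once, then succeeds):

#include <errno.h>
#include <stdio.h>

/* hypothetical stand-in for cpu_function_call(): fails once, then succeeds */
static int try_stop(int cpu)
{
        static int attempts;

        (void)cpu;
        return (attempts++ == 0) ? -EAGAIN : 0;
}

int main(void)
{
        int cpus[] = { 0, 2, 5 };
        size_t i;

restart:
        for (i = 0; i < sizeof(cpus) / sizeof(cpus[0]); i++) {
                if (try_stop(cpus[i]) == -EAGAIN)
                        goto restart;   /* drop the (elided) lock and walk again */
        }
        printf("all stopped\n");
        return 0;
}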
    7619             : 
    7620             : /*
    7621             :  * task tracking -- fork/exit
    7622             :  *
    7623             :  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
    7624             :  */
    7625             : 
    7626             : struct perf_task_event {
    7627             :         struct task_struct              *task;
    7628             :         struct perf_event_context       *task_ctx;
    7629             : 
    7630             :         struct {
    7631             :                 struct perf_event_header        header;
    7632             : 
    7633             :                 u32                             pid;
    7634             :                 u32                             ppid;
    7635             :                 u32                             tid;
    7636             :                 u32                             ptid;
    7637             :                 u64                             time;
    7638             :         } event_id;
    7639             : };
    7640             : 
    7641           0 : static int perf_event_task_match(struct perf_event *event)
    7642             : {
    7643           0 :         return event->attr.comm  || event->attr.mmap ||
    7644           0 :                event->attr.mmap2 || event->attr.mmap_data ||
    7645             :                event->attr.task;
    7646             : }
    7647             : 
    7648           0 : static void perf_event_task_output(struct perf_event *event,
    7649             :                                    void *data)
    7650             : {
    7651           0 :         struct perf_task_event *task_event = data;
    7652           0 :         struct perf_output_handle handle;
    7653           0 :         struct perf_sample_data sample;
    7654           0 :         struct task_struct *task = task_event->task;
    7655           0 :         int ret, size = task_event->event_id.header.size;
    7656             : 
    7657           0 :         if (!perf_event_task_match(event))
    7658           0 :                 return;
    7659             : 
    7660           0 :         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
    7661             : 
    7662           0 :         ret = perf_output_begin(&handle, &sample, event,
    7663           0 :                                 task_event->event_id.header.size);
    7664           0 :         if (ret)
    7665           0 :                 goto out;
    7666             : 
    7667           0 :         task_event->event_id.pid = perf_event_pid(event, task);
    7668           0 :         task_event->event_id.tid = perf_event_tid(event, task);
    7669             : 
    7670           0 :         if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
    7671           0 :                 task_event->event_id.ppid = perf_event_pid(event,
    7672             :                                                         task->real_parent);
    7673           0 :                 task_event->event_id.ptid = perf_event_pid(event,
    7674             :                                                         task->real_parent);
    7675             :         } else {  /* PERF_RECORD_FORK */
    7676           0 :                 task_event->event_id.ppid = perf_event_pid(event, current);
    7677           0 :                 task_event->event_id.ptid = perf_event_tid(event, current);
    7678             :         }
    7679             : 
    7680           0 :         task_event->event_id.time = perf_event_clock(event);
    7681             : 
    7682           0 :         perf_output_put(&handle, task_event->event_id);
    7683             : 
    7684           0 :         perf_event__output_id_sample(event, &handle, &sample);
    7685             : 
    7686           0 :         perf_output_end(&handle);
    7687           0 : out:
    7688           0 :         task_event->event_id.header.size = size;
    7689             : }
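
Note the save/restore of event_id.header.size around the output above: perf_event_header__init_id() grows the header by the per-event sample_id size, but the same task_event template is reused for every other matching event by perf_iterate_sb(), so the original size must be put back. The comm, namespaces, cgroup and mmap output functions below follow the same pattern. A tiny standalone sketch of the idea, with hypothetical names and sizes:

#include <stdio.h>

struct record {                          /* hypothetical shared record template */
        unsigned short size;
};

/* made-up per-event extra id-sample space */
static unsigned short id_sample_size(int event)
{
        return event ? 24 : 0;
}

static void output_one(struct record *rec, int event)
{
        unsigned short saved = rec->size;    /* like 'int size = ...header.size' */

        rec->size += id_sample_size(event);  /* like perf_event_header__init_id() */
        printf("event %d: emitting %u bytes\n", event, (unsigned int)rec->size);

        rec->size = saved;                   /* restore for the next iteration */
}

int main(void)
{
        struct record rec = { .size = 40 };
        int event;

        for (event = 0; event < 3; event++)
                output_one(&rec, event);
        return 0;
}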
    7690             : 
    7691        2389 : static void perf_event_task(struct task_struct *task,
    7692             :                               struct perf_event_context *task_ctx,
    7693             :                               int new)
    7694             : {
    7695        2389 :         struct perf_task_event task_event;
    7696             : 
    7697        2389 :         if (!atomic_read(&nr_comm_events) &&
    7698        2389 :             !atomic_read(&nr_mmap_events) &&
    7699        2389 :             !atomic_read(&nr_task_events))
    7700        2389 :                 return;
    7701             : 
    7702           0 :         task_event = (struct perf_task_event){
    7703             :                 .task     = task,
    7704             :                 .task_ctx = task_ctx,
    7705             :                 .event_id    = {
    7706             :                         .header = {
    7707           0 :                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
    7708             :                                 .misc = 0,
    7709             :                                 .size = sizeof(task_event.event_id),
    7710             :                         },
    7711             :                         /* .pid  */
    7712             :                         /* .ppid */
    7713             :                         /* .tid  */
    7714             :                         /* .ptid */
    7715             :                         /* .time */
    7716             :                 },
    7717             :         };
    7718             : 
    7719           0 :         perf_iterate_sb(perf_event_task_output,
    7720             :                        &task_event,
    7721             :                        task_ctx);
    7722             : }
    7723             : 
    7724        1234 : void perf_event_fork(struct task_struct *task)
    7725             : {
    7726        1234 :         perf_event_task(task, NULL, 1);
    7727        1234 :         perf_event_namespaces(task);
    7728        1234 : }
    7729             : 
    7730             : /*
    7731             :  * comm tracking
    7732             :  */
    7733             : 
    7734             : struct perf_comm_event {
    7735             :         struct task_struct      *task;
    7736             :         char                    *comm;
    7737             :         int                     comm_size;
    7738             : 
    7739             :         struct {
    7740             :                 struct perf_event_header        header;
    7741             : 
    7742             :                 u32                             pid;
    7743             :                 u32                             tid;
    7744             :         } event_id;
    7745             : };
    7746             : 
    7747           0 : static int perf_event_comm_match(struct perf_event *event)
    7748             : {
    7749           0 :         return event->attr.comm;
    7750             : }
    7751             : 
    7752           0 : static void perf_event_comm_output(struct perf_event *event,
    7753             :                                    void *data)
    7754             : {
    7755           0 :         struct perf_comm_event *comm_event = data;
    7756           0 :         struct perf_output_handle handle;
    7757           0 :         struct perf_sample_data sample;
    7758           0 :         int size = comm_event->event_id.header.size;
    7759           0 :         int ret;
    7760             : 
    7761           0 :         if (!perf_event_comm_match(event))
    7762           0 :                 return;
    7763             : 
    7764           0 :         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
    7765           0 :         ret = perf_output_begin(&handle, &sample, event,
    7766           0 :                                 comm_event->event_id.header.size);
    7767             : 
    7768           0 :         if (ret)
    7769           0 :                 goto out;
    7770             : 
    7771           0 :         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
    7772           0 :         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
    7773             : 
    7774           0 :         perf_output_put(&handle, comm_event->event_id);
    7775           0 :         __output_copy(&handle, comm_event->comm,
    7776           0 :                                    comm_event->comm_size);
    7777             : 
    7778           0 :         perf_event__output_id_sample(event, &handle, &sample);
    7779             : 
    7780           0 :         perf_output_end(&handle);
    7781           0 : out:
    7782           0 :         comm_event->event_id.header.size = size;
    7783             : }
    7784             : 
    7785           0 : static void perf_event_comm_event(struct perf_comm_event *comm_event)
    7786             : {
    7787           0 :         char comm[TASK_COMM_LEN];
    7788           0 :         unsigned int size;
    7789             : 
    7790           0 :         memset(comm, 0, sizeof(comm));
    7791           0 :         strlcpy(comm, comm_event->task->comm, sizeof(comm));
    7792           0 :         size = ALIGN(strlen(comm)+1, sizeof(u64));
    7793             : 
    7794           0 :         comm_event->comm = comm;
    7795           0 :         comm_event->comm_size = size;
    7796             : 
    7797           0 :         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
    7798             : 
    7799           0 :         perf_iterate_sb(perf_event_comm_output,
    7800             :                        comm_event,
    7801             :                        NULL);
    7802           0 : }
    7803             : 
    7804         995 : void perf_event_comm(struct task_struct *task, bool exec)
    7805             : {
    7806         995 :         struct perf_comm_event comm_event;
    7807             : 
    7808         995 :         if (!atomic_read(&nr_comm_events))
    7809         995 :                 return;
    7810             : 
    7811           0 :         comm_event = (struct perf_comm_event){
    7812             :                 .task   = task,
    7813             :                 /* .comm      */
    7814             :                 /* .comm_size */
    7815             :                 .event_id  = {
    7816             :                         .header = {
    7817             :                                 .type = PERF_RECORD_COMM,
    7818             :                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
    7819             :                                 /* .size */
    7820             :                         },
    7821             :                         /* .pid */
    7822             :                         /* .tid */
    7823             :                 },
    7824             :         };
    7825             : 
    7826           0 :         perf_event_comm_event(&comm_event);
    7827             : }
    7828             : 
    7829             : /*
    7830             :  * namespaces tracking
    7831             :  */
    7832             : 
    7833             : struct perf_namespaces_event {
    7834             :         struct task_struct              *task;
    7835             : 
    7836             :         struct {
    7837             :                 struct perf_event_header        header;
    7838             : 
    7839             :                 u32                             pid;
    7840             :                 u32                             tid;
    7841             :                 u64                             nr_namespaces;
    7842             :                 struct perf_ns_link_info        link_info[NR_NAMESPACES];
    7843             :         } event_id;
    7844             : };
    7845             : 
    7846           0 : static int perf_event_namespaces_match(struct perf_event *event)
    7847             : {
    7848           0 :         return event->attr.namespaces;
    7849             : }
    7850             : 
    7851           0 : static void perf_event_namespaces_output(struct perf_event *event,
    7852             :                                          void *data)
    7853             : {
    7854           0 :         struct perf_namespaces_event *namespaces_event = data;
    7855           0 :         struct perf_output_handle handle;
    7856           0 :         struct perf_sample_data sample;
    7857           0 :         u16 header_size = namespaces_event->event_id.header.size;
    7858           0 :         int ret;
    7859             : 
    7860           0 :         if (!perf_event_namespaces_match(event))
    7861           0 :                 return;
    7862             : 
    7863           0 :         perf_event_header__init_id(&namespaces_event->event_id.header,
    7864             :                                    &sample, event);
    7865           0 :         ret = perf_output_begin(&handle, &sample, event,
    7866           0 :                                 namespaces_event->event_id.header.size);
    7867           0 :         if (ret)
    7868           0 :                 goto out;
    7869             : 
    7870           0 :         namespaces_event->event_id.pid = perf_event_pid(event,
    7871             :                                                         namespaces_event->task);
    7872           0 :         namespaces_event->event_id.tid = perf_event_tid(event,
    7873             :                                                         namespaces_event->task);
    7874             : 
    7875           0 :         perf_output_put(&handle, namespaces_event->event_id);
    7876             : 
    7877           0 :         perf_event__output_id_sample(event, &handle, &sample);
    7878             : 
    7879           0 :         perf_output_end(&handle);
    7880           0 : out:
    7881           0 :         namespaces_event->event_id.header.size = header_size;
    7882             : }
    7883             : 
    7884           0 : static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
    7885             :                                    struct task_struct *task,
    7886             :                                    const struct proc_ns_operations *ns_ops)
    7887             : {
    7888           0 :         struct path ns_path;
    7889           0 :         struct inode *ns_inode;
    7890           0 :         int error;
    7891             : 
    7892           0 :         error = ns_get_path(&ns_path, task, ns_ops);
    7893           0 :         if (!error) {
    7894           0 :                 ns_inode = ns_path.dentry->d_inode;
    7895           0 :                 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
    7896           0 :                 ns_link_info->ino = ns_inode->i_ino;
    7897           0 :                 path_put(&ns_path);
    7898             :         }
    7899           0 : }
    7900             : 
    7901        1284 : void perf_event_namespaces(struct task_struct *task)
    7902             : {
    7903        1284 :         struct perf_namespaces_event namespaces_event;
    7904        1284 :         struct perf_ns_link_info *ns_link_info;
    7905             : 
    7906        1284 :         if (!atomic_read(&nr_namespaces_events))
    7907        1284 :                 return;
    7908             : 
    7909           0 :         namespaces_event = (struct perf_namespaces_event){
    7910             :                 .task   = task,
    7911             :                 .event_id  = {
    7912             :                         .header = {
    7913             :                                 .type = PERF_RECORD_NAMESPACES,
    7914             :                                 .misc = 0,
    7915             :                                 .size = sizeof(namespaces_event.event_id),
    7916             :                         },
    7917             :                         /* .pid */
    7918             :                         /* .tid */
    7919             :                         .nr_namespaces = NR_NAMESPACES,
    7920             :                         /* .link_info[NR_NAMESPACES] */
    7921             :                 },
    7922             :         };
    7923             : 
    7924           0 :         ns_link_info = namespaces_event.event_id.link_info;
    7925             : 
    7926           0 :         perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
    7927             :                                task, &mntns_operations);
    7928             : 
    7929             : #ifdef CONFIG_USER_NS
    7930             :         perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
    7931             :                                task, &userns_operations);
    7932             : #endif
    7933             : #ifdef CONFIG_NET_NS
    7934             :         perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
    7935             :                                task, &netns_operations);
    7936             : #endif
    7937             : #ifdef CONFIG_UTS_NS
    7938             :         perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
    7939             :                                task, &utsns_operations);
    7940             : #endif
    7941             : #ifdef CONFIG_IPC_NS
    7942             :         perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
    7943             :                                task, &ipcns_operations);
    7944             : #endif
    7945             : #ifdef CONFIG_PID_NS
    7946             :         perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
    7947             :                                task, &pidns_operations);
    7948             : #endif
    7949             : #ifdef CONFIG_CGROUPS
    7950           0 :         perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
    7951             :                                task, &cgroupns_operations);
    7952             : #endif
    7953             : 
    7954           0 :         perf_iterate_sb(perf_event_namespaces_output,
    7955             :                         &namespaces_event,
    7956             :                         NULL);
    7957             : }
    7958             : 
    7959             : /*
    7960             :  * cgroup tracking
    7961             :  */
    7962             : #ifdef CONFIG_CGROUP_PERF
    7963             : 
    7964             : struct perf_cgroup_event {
    7965             :         char                            *path;
    7966             :         int                             path_size;
    7967             :         struct {
    7968             :                 struct perf_event_header        header;
    7969             :                 u64                             id;
    7970             :                 char                            path[];
    7971             :         } event_id;
    7972             : };
    7973             : 
    7974             : static int perf_event_cgroup_match(struct perf_event *event)
    7975             : {
    7976             :         return event->attr.cgroup;
    7977             : }
    7978             : 
    7979             : static void perf_event_cgroup_output(struct perf_event *event, void *data)
    7980             : {
    7981             :         struct perf_cgroup_event *cgroup_event = data;
    7982             :         struct perf_output_handle handle;
    7983             :         struct perf_sample_data sample;
    7984             :         u16 header_size = cgroup_event->event_id.header.size;
    7985             :         int ret;
    7986             : 
    7987             :         if (!perf_event_cgroup_match(event))
    7988             :                 return;
    7989             : 
    7990             :         perf_event_header__init_id(&cgroup_event->event_id.header,
    7991             :                                    &sample, event);
    7992             :         ret = perf_output_begin(&handle, &sample, event,
    7993             :                                 cgroup_event->event_id.header.size);
    7994             :         if (ret)
    7995             :                 goto out;
    7996             : 
    7997             :         perf_output_put(&handle, cgroup_event->event_id);
    7998             :         __output_copy(&handle, cgroup_event->path, cgroup_event->path_size);
    7999             : 
    8000             :         perf_event__output_id_sample(event, &handle, &sample);
    8001             : 
    8002             :         perf_output_end(&handle);
    8003             : out:
    8004             :         cgroup_event->event_id.header.size = header_size;
    8005             : }
    8006             : 
    8007             : static void perf_event_cgroup(struct cgroup *cgrp)
    8008             : {
    8009             :         struct perf_cgroup_event cgroup_event;
    8010             :         char path_enomem[16] = "//enomem";
    8011             :         char *pathname;
    8012             :         size_t size;
    8013             : 
    8014             :         if (!atomic_read(&nr_cgroup_events))
    8015             :                 return;
    8016             : 
    8017             :         cgroup_event = (struct perf_cgroup_event){
    8018             :                 .event_id  = {
    8019             :                         .header = {
    8020             :                                 .type = PERF_RECORD_CGROUP,
    8021             :                                 .misc = 0,
    8022             :                                 .size = sizeof(cgroup_event.event_id),
    8023             :                         },
    8024             :                         .id = cgroup_id(cgrp),
    8025             :                 },
    8026             :         };
    8027             : 
    8028             :         pathname = kmalloc(PATH_MAX, GFP_KERNEL);
    8029             :         if (pathname == NULL) {
    8030             :                 cgroup_event.path = path_enomem;
    8031             :         } else {
    8032             :                 /* just to be sure to have enough space for alignment */
    8033             :                 cgroup_path(cgrp, pathname, PATH_MAX - sizeof(u64));
    8034             :                 cgroup_event.path = pathname;
    8035             :         }
    8036             : 
    8037             :         /*
    8038             :          * Since our buffer works in 8-byte units, we need to align our string
    8039             :          * size to a multiple of 8. However, we must guarantee the tail end is
    8040             :          * zeroed out to avoid leaking random bits to userspace.
    8041             :          */
    8042             :         size = strlen(cgroup_event.path) + 1;
    8043             :         while (!IS_ALIGNED(size, sizeof(u64)))
    8044             :                 cgroup_event.path[size++] = '\0';
    8045             : 
    8046             :         cgroup_event.event_id.header.size += size;
    8047             :         cgroup_event.path_size = size;
    8048             : 
    8049             :         perf_iterate_sb(perf_event_cgroup_output,
    8050             :                         &cgroup_event,
    8051             :                         NULL);
    8052             : 
    8053             :         kfree(pathname);
    8054             : }
    8055             : 
    8056             : #endif
    8057             : 
    8058             : /*
    8059             :  * mmap tracking
    8060             :  */
    8061             : 
    8062             : struct perf_mmap_event {
    8063             :         struct vm_area_struct   *vma;
    8064             : 
    8065             :         const char              *file_name;
    8066             :         int                     file_size;
    8067             :         int                     maj, min;
    8068             :         u64                     ino;
    8069             :         u64                     ino_generation;
    8070             :         u32                     prot, flags;
    8071             :         u8                      build_id[BUILD_ID_SIZE_MAX];
    8072             :         u32                     build_id_size;
    8073             : 
    8074             :         struct {
    8075             :                 struct perf_event_header        header;
    8076             : 
    8077             :                 u32                             pid;
    8078             :                 u32                             tid;
    8079             :                 u64                             start;
    8080             :                 u64                             len;
    8081             :                 u64                             pgoff;
    8082             :         } event_id;
    8083             : };
    8084             : 
    8085           0 : static int perf_event_mmap_match(struct perf_event *event,
    8086             :                                  void *data)
    8087             : {
    8088           0 :         struct perf_mmap_event *mmap_event = data;
    8089           0 :         struct vm_area_struct *vma = mmap_event->vma;
    8090           0 :         int executable = vma->vm_flags & VM_EXEC;
    8091             : 
    8092           0 :         return (!executable && event->attr.mmap_data) ||
    8093           0 :                (executable && (event->attr.mmap || event->attr.mmap2));
    8094             : }
    8095             : 
    8096           0 : static void perf_event_mmap_output(struct perf_event *event,
    8097             :                                    void *data)
    8098             : {
    8099           0 :         struct perf_mmap_event *mmap_event = data;
    8100           0 :         struct perf_output_handle handle;
    8101           0 :         struct perf_sample_data sample;
    8102           0 :         int size = mmap_event->event_id.header.size;
    8103           0 :         u32 type = mmap_event->event_id.header.type;
    8104           0 :         bool use_build_id;
    8105           0 :         int ret;
    8106             : 
    8107           0 :         if (!perf_event_mmap_match(event, data))
    8108           0 :                 return;
    8109             : 
    8110           0 :         if (event->attr.mmap2) {
    8111           0 :                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
    8112           0 :                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
    8113           0 :                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
    8114           0 :                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
    8115           0 :                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
    8116           0 :                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
    8117           0 :                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
    8118             :         }
    8119             : 
    8120           0 :         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
    8121           0 :         ret = perf_output_begin(&handle, &sample, event,
    8122           0 :                                 mmap_event->event_id.header.size);
    8123           0 :         if (ret)
    8124           0 :                 goto out;
    8125             : 
    8126           0 :         mmap_event->event_id.pid = perf_event_pid(event, current);
    8127           0 :         mmap_event->event_id.tid = perf_event_tid(event, current);
    8128             : 
    8129           0 :         use_build_id = event->attr.build_id && mmap_event->build_id_size;
    8130             : 
    8131           0 :         if (event->attr.mmap2 && use_build_id)
    8132           0 :                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;
    8133             : 
    8134           0 :         perf_output_put(&handle, mmap_event->event_id);
    8135             : 
    8136           0 :         if (event->attr.mmap2) {
    8137           0 :                 if (use_build_id) {
    8138           0 :                         u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };
    8139             : 
    8140           0 :                         __output_copy(&handle, size, 4);
    8141           0 :                         __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
    8142             :                 } else {
    8143           0 :                         perf_output_put(&handle, mmap_event->maj);
    8144           0 :                         perf_output_put(&handle, mmap_event->min);
    8145           0 :                         perf_output_put(&handle, mmap_event->ino);
    8146           0 :                         perf_output_put(&handle, mmap_event->ino_generation);
    8147             :                 }
    8148           0 :                 perf_output_put(&handle, mmap_event->prot);
    8149           0 :                 perf_output_put(&handle, mmap_event->flags);
    8150             :         }
    8151             : 
    8152           0 :         __output_copy(&handle, mmap_event->file_name,
    8153           0 :                                    mmap_event->file_size);
    8154             : 
    8155           0 :         perf_event__output_id_sample(event, &handle, &sample);
    8156             : 
    8157           0 :         perf_output_end(&handle);
    8158           0 : out:
    8159           0 :         mmap_event->event_id.header.size = size;
    8160           0 :         mmap_event->event_id.header.type = type;
    8161             : }
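
For reference, the two possible tails of a PERF_RECORD_MMAP2 record written above (device/inode versus build ID) can be pictured as the structs below. This only illustrates the field order produced by the perf_output_put()/__output_copy() calls; the BUILD_ID_SIZE_MAX value and the reserved-byte layout are assumptions mirroring the code, not an authoritative ABI definition.

#include <stdint.h>

#define BUILD_ID_SIZE_MAX 20    /* assumed to match the kernel's <linux/buildid.h> */

/* tail when PERF_RECORD_MISC_MMAP_BUILD_ID is NOT set */
struct mmap2_tail_dev_inode {
        uint32_t maj;
        uint32_t min;
        uint64_t ino;
        uint64_t ino_generation;
        uint32_t prot;
        uint32_t flags;
};

/* tail when PERF_RECORD_MISC_MMAP_BUILD_ID is set: a one-byte build-ID
 * length, three zero bytes, the (fixed-size) build ID, then prot/flags */
struct mmap2_tail_build_id {
        uint8_t  build_id_size;
        uint8_t  reserved[3];
        uint8_t  build_id[BUILD_ID_SIZE_MAX];
        uint32_t prot;
        uint32_t flags;
};

int main(void)
{
        /* with a 20-byte build ID, both tails occupy the same 32 bytes */
        return sizeof(struct mmap2_tail_dev_inode) ==
               sizeof(struct mmap2_tail_build_id) ? 0 : 1;
}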
    8162             : 
    8163           0 : static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
    8164             : {
    8165           0 :         struct vm_area_struct *vma = mmap_event->vma;
    8166           0 :         struct file *file = vma->vm_file;
    8167           0 :         int maj = 0, min = 0;
    8168           0 :         u64 ino = 0, gen = 0;
    8169           0 :         u32 prot = 0, flags = 0;
    8170           0 :         unsigned int size;
    8171           0 :         char tmp[16];
    8172           0 :         char *buf = NULL;
    8173           0 :         char *name;
    8174             : 
    8175           0 :         if (vma->vm_flags & VM_READ)
    8176             :                 prot |= PROT_READ;
    8177           0 :         if (vma->vm_flags & VM_WRITE)
    8178           0 :                 prot |= PROT_WRITE;
    8179           0 :         if (vma->vm_flags & VM_EXEC)
    8180           0 :                 prot |= PROT_EXEC;
    8181             : 
    8182           0 :         if (vma->vm_flags & VM_MAYSHARE)
    8183             :                 flags = MAP_SHARED;
    8184             :         else
    8185           0 :                 flags = MAP_PRIVATE;
    8186             : 
    8187           0 :         if (vma->vm_flags & VM_DENYWRITE)
    8188           0 :                 flags |= MAP_DENYWRITE;
    8189           0 :         if (vma->vm_flags & VM_MAYEXEC)
    8190           0 :                 flags |= MAP_EXECUTABLE;
    8191           0 :         if (vma->vm_flags & VM_LOCKED)
    8192           0 :                 flags |= MAP_LOCKED;
    8193           0 :         if (is_vm_hugetlb_page(vma))
    8194             :                 flags |= MAP_HUGETLB;
    8195             : 
    8196           0 :         if (file) {
    8197           0 :                 struct inode *inode;
    8198           0 :                 dev_t dev;
    8199             : 
    8200           0 :                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
    8201           0 :                 if (!buf) {
    8202           0 :                         name = "//enomem";
    8203           0 :                         goto cpy_name;
    8204             :                 }
    8205             :                 /*
    8206             :                  * d_path() fills the supplied buffer from the end backwards, so we
    8207             :                  * need to add enough zero bytes after the string to handle
    8208             :                  * the 64-bit alignment we do later.
    8209             :                  */
    8210           0 :                 name = file_path(file, buf, PATH_MAX - sizeof(u64));
    8211           0 :                 if (IS_ERR(name)) {
    8212           0 :                         name = "//toolong";
    8213           0 :                         goto cpy_name;
    8214             :                 }
    8215           0 :                 inode = file_inode(vma->vm_file);
    8216           0 :                 dev = inode->i_sb->s_dev;
    8217           0 :                 ino = inode->i_ino;
    8218           0 :                 gen = inode->i_generation;
    8219           0 :                 maj = MAJOR(dev);
    8220           0 :                 min = MINOR(dev);
    8221             : 
    8222           0 :                 goto got_name;
    8223             :         } else {
    8224           0 :                 if (vma->vm_ops && vma->vm_ops->name) {
    8225           0 :                         name = (char *) vma->vm_ops->name(vma);
    8226           0 :                         if (name)
    8227           0 :                                 goto cpy_name;
    8228             :                 }
    8229             : 
    8230           0 :                 name = (char *)arch_vma_name(vma);
    8231           0 :                 if (name)
    8232           0 :                         goto cpy_name;
    8233             : 
    8234           0 :                 if (vma->vm_start <= vma->vm_mm->start_brk &&
    8235           0 :                                 vma->vm_end >= vma->vm_mm->brk) {
    8236           0 :                         name = "[heap]";
    8237           0 :                         goto cpy_name;
    8238             :                 }
    8239           0 :                 if (vma->vm_start <= vma->vm_mm->start_stack &&
    8240           0 :                                 vma->vm_end >= vma->vm_mm->start_stack) {
    8241           0 :                         name = "[stack]";
    8242           0 :                         goto cpy_name;
    8243             :                 }
    8244             : 
    8245           0 :                 name = "//anon";
    8246           0 :                 goto cpy_name;
    8247             :         }
    8248             : 
    8249           0 : cpy_name:
    8250           0 :         strlcpy(tmp, name, sizeof(tmp));
    8251           0 :         name = tmp;
    8252           0 : got_name:
    8253             :         /*
    8254             :          * Since our buffer works in 8-byte units, we need to align our string
    8255             :          * size to a multiple of 8. However, we must guarantee the tail end is
    8256             :          * zeroed out to avoid leaking random bits to userspace.
    8257             :          */
    8258           0 :         size = strlen(name)+1;
    8259           0 :         while (!IS_ALIGNED(size, sizeof(u64)))
    8260           0 :                 name[size++] = '\0';
    8261             : 
    8262           0 :         mmap_event->file_name = name;
    8263           0 :         mmap_event->file_size = size;
    8264           0 :         mmap_event->maj = maj;
    8265           0 :         mmap_event->min = min;
    8266           0 :         mmap_event->ino = ino;
    8267           0 :         mmap_event->ino_generation = gen;
    8268           0 :         mmap_event->prot = prot;
    8269           0 :         mmap_event->flags = flags;
    8270             : 
    8271           0 :         if (!(vma->vm_flags & VM_EXEC))
    8272           0 :                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
    8273             : 
    8274           0 :         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
    8275             : 
    8276           0 :         if (atomic_read(&nr_build_id_events))
    8277           0 :                 build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size);
    8278             : 
    8279           0 :         perf_iterate_sb(perf_event_mmap_output,
    8280             :                        mmap_event,
    8281             :                        NULL);
    8282             : 
    8283           0 :         kfree(buf);
    8284           0 : }
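
The alignment step above (and the identical one in the comm and cgroup paths) pads the name with extra NUL bytes until its length is a multiple of 8, so the 8-byte-unit output buffer never carries stale bytes to userspace. A standalone sketch of just that padding, using a hypothetical helper name:

#include <stdio.h>
#include <string.h>

/* Pad a NUL-terminated string to a multiple of 8 bytes with extra NULs,
 * mirroring the "while (!IS_ALIGNED(size, sizeof(u64)))" loop above.
 * Illustration only; 'buf' must have room for up to 7 extra bytes. */
static unsigned int pad_to_u64(char *buf)
{
        unsigned int size = strlen(buf) + 1;    /* include the terminating NUL */

        while (size % 8)
                buf[size++] = '\0';             /* zero the tail, never stale bytes */
        return size;
}

int main(void)
{
        char name[32] = "[stack]";      /* 7 chars + NUL = 8, already aligned */
        char other[32] = "/bin/true";   /* 9 chars + NUL = 10, padded to 16   */

        printf("%u %u\n", pad_to_u64(name), pad_to_u64(other));
        return 0;
}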
    8285             : 
    8286             : /*
    8287             :  * Check whether inode and address range match filter criteria.
    8288             :  */
    8289           0 : static bool perf_addr_filter_match(struct perf_addr_filter *filter,
    8290             :                                      struct file *file, unsigned long offset,
    8291             :                                      unsigned long size)
    8292             : {
    8293             :         /* d_inode(NULL) won't be equal to any mapped user-space file */
    8294           0 :         if (!filter->path.dentry)
    8295             :                 return false;
    8296             : 
    8297           0 :         if (d_inode(filter->path.dentry) != file_inode(file))
    8298             :                 return false;
    8299             : 
    8300           0 :         if (filter->offset > offset + size)
    8301             :                 return false;
    8302             : 
    8303           0 :         if (filter->offset + filter->size < offset)
    8304           0 :                 return false;
    8305             : 
    8306             :         return true;
    8307             : }
    8308             : 
    8309           0 : static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
    8310             :                                         struct vm_area_struct *vma,
    8311             :                                         struct perf_addr_filter_range *fr)
    8312             : {
    8313           0 :         unsigned long vma_size = vma->vm_end - vma->vm_start;
    8314           0 :         unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
    8315           0 :         struct file *file = vma->vm_file;
    8316             : 
    8317           0 :         if (!perf_addr_filter_match(filter, file, off, vma_size))
    8318             :                 return false;
    8319             : 
    8320           0 :         if (filter->offset < off) {
    8321           0 :                 fr->start = vma->vm_start;
    8322           0 :                 fr->size = min(vma_size, filter->size - (off - filter->offset));
    8323             :         } else {
    8324           0 :                 fr->start = vma->vm_start + filter->offset - off;
    8325           0 :                 fr->size = min(vma->vm_end - fr->start, filter->size);
    8326             :         }
    8327             : 
    8328             :         return true;
    8329             : }
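A worked example of the adjustment above (made-up numbers): suppose a filter covers file offsets [0x2000, 0x3000) and a VMA maps file offset 0x1000 at vm_start 0x400000 with a length of 0x4000. Then off == 0x1000, filter->offset is not below off, and the else branch yields:

        fr->start = 0x400000 + 0x2000 - 0x1000       = 0x401000
        fr->size  = min(0x404000 - 0x401000, 0x1000) = 0x1000

i.e. the filter range is narrowed to the single page of this mapping that actually backs the filtered part of the file.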
    8330             : 
    8331           0 : static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
    8332             : {
    8333           0 :         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
    8334           0 :         struct vm_area_struct *vma = data;
    8335           0 :         struct perf_addr_filter *filter;
    8336           0 :         unsigned int restart = 0, count = 0;
    8337           0 :         unsigned long flags;
    8338             : 
    8339           0 :         if (!has_addr_filter(event))
    8340             :                 return;
    8341             : 
    8342           0 :         if (!vma->vm_file)
    8343             :                 return;
    8344             : 
    8345           0 :         raw_spin_lock_irqsave(&ifh->lock, flags);
    8346           0 :         list_for_each_entry(filter, &ifh->list, entry) {
    8347           0 :                 if (perf_addr_filter_vma_adjust(filter, vma,
    8348           0 :                                                 &event->addr_filter_ranges[count]))
    8349           0 :                         restart++;
    8350             : 
    8351           0 :                 count++;
    8352             :         }
    8353             : 
    8354           0 :         if (restart)
    8355           0 :                 event->addr_filters_gen++;
    8356           0 :         raw_spin_unlock_irqrestore(&ifh->lock, flags);
    8357             : 
    8358           0 :         if (restart)
    8359           0 :                 perf_event_stop(event, 1);
    8360             : }
    8361             : 
    8362             : /*
     8363             :  * Adjust all of the task's events' filters to the new vma
    8364             :  */
    8365           0 : static void perf_addr_filters_adjust(struct vm_area_struct *vma)
    8366             : {
    8367           0 :         struct perf_event_context *ctx;
    8368           0 :         int ctxn;
    8369             : 
    8370             :         /*
    8371             :          * Data tracing isn't supported yet and as such there is no need
    8372             :          * to keep track of anything that isn't related to executable code:
    8373             :          */
    8374           0 :         if (!(vma->vm_flags & VM_EXEC))
    8375             :                 return;
    8376             : 
    8377           0 :         rcu_read_lock();
    8378           0 :         for_each_task_context_nr(ctxn) {
    8379           0 :                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
    8380           0 :                 if (!ctx)
    8381           0 :                         continue;
    8382             : 
    8383           0 :                 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
    8384             :         }
    8385           0 :         rcu_read_unlock();
    8386             : }
    8387             : 
    8388       40323 : void perf_event_mmap(struct vm_area_struct *vma)
    8389             : {
    8390       40323 :         struct perf_mmap_event mmap_event;
    8391             : 
    8392       40323 :         if (!atomic_read(&nr_mmap_events))
    8393       40322 :                 return;
    8394             : 
    8395           0 :         mmap_event = (struct perf_mmap_event){
    8396             :                 .vma    = vma,
    8397             :                 /* .file_name */
    8398             :                 /* .file_size */
    8399             :                 .event_id  = {
    8400             :                         .header = {
    8401             :                                 .type = PERF_RECORD_MMAP,
    8402             :                                 .misc = PERF_RECORD_MISC_USER,
    8403             :                                 /* .size */
    8404             :                         },
    8405             :                         /* .pid */
    8406             :                         /* .tid */
    8407           0 :                         .start  = vma->vm_start,
    8408           0 :                         .len    = vma->vm_end - vma->vm_start,
    8409           0 :                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
    8410             :                 },
    8411             :                 /* .maj (attr_mmap2 only) */
    8412             :                 /* .min (attr_mmap2 only) */
    8413             :                 /* .ino (attr_mmap2 only) */
    8414             :                 /* .ino_generation (attr_mmap2 only) */
    8415             :                 /* .prot (attr_mmap2 only) */
    8416             :                 /* .flags (attr_mmap2 only) */
    8417             :         };
    8418             : 
    8419           0 :         perf_addr_filters_adjust(vma);
    8420           0 :         perf_event_mmap_event(&mmap_event);
    8421             : }
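These records only reach userspace if at least one event asked for them (the nr_mmap_events check above). A minimal user-space sketch (illustrative, not from the kernel tree; error handling omitted) that opens a dummy software event and requests PERF_RECORD_MMAP2 records, with build IDs where the running kernel supports them:

        #include <linux/perf_event.h>
        #include <sys/syscall.h>
        #include <unistd.h>
        #include <string.h>

        static int open_mmap_listener(pid_t pid)
        {
                struct perf_event_attr attr;

                memset(&attr, 0, sizeof(attr));
                attr.size          = sizeof(attr);
                attr.type          = PERF_TYPE_SOFTWARE;
                attr.config        = PERF_COUNT_SW_DUMMY; /* side-band records only */
                attr.mmap2         = 1; /* request PERF_RECORD_MMAP2 */
                attr.build_id      = 1; /* prefer build ID over dev/ino (5.12+) */
                attr.sample_id_all = 1;

                /* perf_event_open() has no glibc wrapper */
                return syscall(__NR_perf_event_open, &attr, pid,
                               /*cpu*/ -1, /*group_fd*/ -1, 0);
        }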
    8422             : 
    8423           0 : void perf_event_aux_event(struct perf_event *event, unsigned long head,
    8424             :                           unsigned long size, u64 flags)
    8425             : {
    8426           0 :         struct perf_output_handle handle;
    8427           0 :         struct perf_sample_data sample;
    8428           0 :         struct perf_aux_event {
    8429             :                 struct perf_event_header        header;
    8430             :                 u64                             offset;
    8431             :                 u64                             size;
    8432             :                 u64                             flags;
    8433           0 :         } rec = {
    8434             :                 .header = {
    8435             :                         .type = PERF_RECORD_AUX,
    8436             :                         .misc = 0,
    8437             :                         .size = sizeof(rec),
    8438             :                 },
    8439             :                 .offset         = head,
    8440             :                 .size           = size,
    8441             :                 .flags          = flags,
    8442             :         };
    8443           0 :         int ret;
    8444             : 
    8445           0 :         perf_event_header__init_id(&rec.header, &sample, event);
    8446           0 :         ret = perf_output_begin(&handle, &sample, event, rec.header.size);
    8447             : 
    8448           0 :         if (ret)
    8449           0 :                 return;
    8450             : 
    8451           0 :         perf_output_put(&handle, rec);
    8452           0 :         perf_event__output_id_sample(event, &handle, &sample);
    8453             : 
    8454           0 :         perf_output_end(&handle);
    8455             : }
    8456             : 
    8457             : /*
    8458             :  * Lost/dropped samples logging
    8459             :  */
    8460           0 : void perf_log_lost_samples(struct perf_event *event, u64 lost)
    8461             : {
    8462           0 :         struct perf_output_handle handle;
    8463           0 :         struct perf_sample_data sample;
    8464           0 :         int ret;
    8465             : 
    8466           0 :         struct {
    8467             :                 struct perf_event_header        header;
    8468             :                 u64                             lost;
    8469           0 :         } lost_samples_event = {
    8470             :                 .header = {
    8471             :                         .type = PERF_RECORD_LOST_SAMPLES,
    8472             :                         .misc = 0,
    8473             :                         .size = sizeof(lost_samples_event),
    8474             :                 },
    8475             :                 .lost           = lost,
    8476             :         };
    8477             : 
    8478           0 :         perf_event_header__init_id(&lost_samples_event.header, &sample, event);
    8479             : 
    8480           0 :         ret = perf_output_begin(&handle, &sample, event,
    8481           0 :                                 lost_samples_event.header.size);
    8482           0 :         if (ret)
    8483           0 :                 return;
    8484             : 
    8485           0 :         perf_output_put(&handle, lost_samples_event);
    8486           0 :         perf_event__output_id_sample(event, &handle, &sample);
    8487           0 :         perf_output_end(&handle);
    8488             : }
    8489             : 
    8490             : /*
    8491             :  * context_switch tracking
    8492             :  */
    8493             : 
    8494             : struct perf_switch_event {
    8495             :         struct task_struct      *task;
    8496             :         struct task_struct      *next_prev;
    8497             : 
    8498             :         struct {
    8499             :                 struct perf_event_header        header;
    8500             :                 u32                             next_prev_pid;
    8501             :                 u32                             next_prev_tid;
    8502             :         } event_id;
    8503             : };
    8504             : 
    8505           0 : static int perf_event_switch_match(struct perf_event *event)
    8506             : {
    8507           0 :         return event->attr.context_switch;
    8508             : }
    8509             : 
    8510           0 : static void perf_event_switch_output(struct perf_event *event, void *data)
    8511             : {
    8512           0 :         struct perf_switch_event *se = data;
    8513           0 :         struct perf_output_handle handle;
    8514           0 :         struct perf_sample_data sample;
    8515           0 :         int ret;
    8516             : 
    8517           0 :         if (!perf_event_switch_match(event))
    8518           0 :                 return;
    8519             : 
    8520             :         /* Only CPU-wide events are allowed to see next/prev pid/tid */
    8521           0 :         if (event->ctx->task) {
    8522           0 :                 se->event_id.header.type = PERF_RECORD_SWITCH;
    8523           0 :                 se->event_id.header.size = sizeof(se->event_id.header);
    8524             :         } else {
    8525           0 :                 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
    8526           0 :                 se->event_id.header.size = sizeof(se->event_id);
    8527           0 :                 se->event_id.next_prev_pid =
    8528           0 :                                         perf_event_pid(event, se->next_prev);
    8529           0 :                 se->event_id.next_prev_tid =
    8530           0 :                                         perf_event_tid(event, se->next_prev);
    8531             :         }
    8532             : 
    8533           0 :         perf_event_header__init_id(&se->event_id.header, &sample, event);
    8534             : 
    8535           0 :         ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size);
    8536           0 :         if (ret)
    8537             :                 return;
    8538             : 
    8539           0 :         if (event->ctx->task)
    8540           0 :                 perf_output_put(&handle, se->event_id.header);
    8541             :         else
    8542           0 :                 perf_output_put(&handle, se->event_id);
    8543             : 
    8544           0 :         perf_event__output_id_sample(event, &handle, &sample);
    8545             : 
    8546           0 :         perf_output_end(&handle);
    8547             : }
    8548             : 
    8549           0 : static void perf_event_switch(struct task_struct *task,
    8550             :                               struct task_struct *next_prev, bool sched_in)
    8551             : {
    8552           0 :         struct perf_switch_event switch_event;
    8553             : 
    8554             :         /* N.B. caller checks nr_switch_events != 0 */
    8555             : 
    8556           0 :         switch_event = (struct perf_switch_event){
    8557             :                 .task           = task,
    8558             :                 .next_prev      = next_prev,
    8559             :                 .event_id       = {
    8560             :                         .header = {
    8561             :                                 /* .type */
    8562             :                                 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
    8563             :                                 /* .size */
    8564             :                         },
    8565             :                         /* .next_prev_pid */
    8566             :                         /* .next_prev_tid */
    8567             :                 },
    8568             :         };
    8569             : 
    8570           0 :         if (!sched_in && task->state == TASK_RUNNING)
    8571           0 :                 switch_event.event_id.header.misc |=
    8572             :                                 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
    8573             : 
    8574           0 :         perf_iterate_sb(perf_event_switch_output,
    8575             :                        &switch_event,
    8576             :                        NULL);
    8577           0 : }
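Which of the two record layouts is used is decided in perf_event_switch_output() from event->ctx->task, i.e. from how the event was opened. A hedged user-space sketch (illustrative only) of a CPU-wide listener, which therefore receives the extended PERF_RECORD_SWITCH_CPU_WIDE records with next/prev pid/tid:

        #include <linux/perf_event.h>
        #include <sys/syscall.h>
        #include <unistd.h>

        static int open_switch_listener(int cpu)
        {
                struct perf_event_attr attr = {
                        .size           = sizeof(attr),
                        .type           = PERF_TYPE_SOFTWARE,
                        .config         = PERF_COUNT_SW_DUMMY,
                        .context_switch = 1,    /* ask for switch records */
                        .sample_id_all  = 1,
                };

                /* pid == -1, cpu >= 0: CPU-wide; a per-task event (pid >= 0,
                 * cpu == -1) would get the short PERF_RECORD_SWITCH instead. */
                return syscall(__NR_perf_event_open, &attr, /*pid*/ -1, cpu,
                               /*group_fd*/ -1, PERF_FLAG_FD_CLOEXEC);
        }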
    8578             : 
    8579             : /*
    8580             :  * IRQ throttle logging
    8581             :  */
    8582             : 
    8583           0 : static void perf_log_throttle(struct perf_event *event, int enable)
    8584             : {
    8585           0 :         struct perf_output_handle handle;
    8586           0 :         struct perf_sample_data sample;
    8587           0 :         int ret;
    8588             : 
    8589           0 :         struct {
    8590             :                 struct perf_event_header        header;
    8591             :                 u64                             time;
    8592             :                 u64                             id;
    8593             :                 u64                             stream_id;
    8594           0 :         } throttle_event = {
    8595             :                 .header = {
    8596             :                         .type = PERF_RECORD_THROTTLE,
    8597             :                         .misc = 0,
    8598             :                         .size = sizeof(throttle_event),
    8599             :                 },
    8600           0 :                 .time           = perf_event_clock(event),
    8601           0 :                 .id             = primary_event_id(event),
    8602             :                 .stream_id      = event->id,
    8603             :         };
    8604             : 
    8605           0 :         if (enable)
    8606           0 :                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
    8607             : 
    8608           0 :         perf_event_header__init_id(&throttle_event.header, &sample, event);
    8609             : 
    8610           0 :         ret = perf_output_begin(&handle, &sample, event,
    8611           0 :                                 throttle_event.header.size);
    8612           0 :         if (ret)
    8613           0 :                 return;
    8614             : 
    8615           0 :         perf_output_put(&handle, throttle_event);
    8616           0 :         perf_event__output_id_sample(event, &handle, &sample);
    8617           0 :         perf_output_end(&handle);
    8618             : }
    8619             : 
    8620             : /*
    8621             :  * ksymbol register/unregister tracking
    8622             :  */
    8623             : 
    8624             : struct perf_ksymbol_event {
    8625             :         const char      *name;
    8626             :         int             name_len;
    8627             :         struct {
    8628             :                 struct perf_event_header        header;
    8629             :                 u64                             addr;
    8630             :                 u32                             len;
    8631             :                 u16                             ksym_type;
    8632             :                 u16                             flags;
    8633             :         } event_id;
    8634             : };
    8635             : 
    8636           0 : static int perf_event_ksymbol_match(struct perf_event *event)
    8637             : {
    8638           0 :         return event->attr.ksymbol;
    8639             : }
    8640             : 
    8641           0 : static void perf_event_ksymbol_output(struct perf_event *event, void *data)
    8642             : {
    8643           0 :         struct perf_ksymbol_event *ksymbol_event = data;
    8644           0 :         struct perf_output_handle handle;
    8645           0 :         struct perf_sample_data sample;
    8646           0 :         int ret;
    8647             : 
    8648           0 :         if (!perf_event_ksymbol_match(event))
    8649           0 :                 return;
    8650             : 
    8651           0 :         perf_event_header__init_id(&ksymbol_event->event_id.header,
    8652             :                                    &sample, event);
    8653           0 :         ret = perf_output_begin(&handle, &sample, event,
    8654           0 :                                 ksymbol_event->event_id.header.size);
    8655           0 :         if (ret)
    8656             :                 return;
    8657             : 
    8658           0 :         perf_output_put(&handle, ksymbol_event->event_id);
    8659           0 :         __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
    8660           0 :         perf_event__output_id_sample(event, &handle, &sample);
    8661             : 
    8662           0 :         perf_output_end(&handle);
    8663             : }
    8664             : 
    8665           0 : void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
    8666             :                         const char *sym)
    8667             : {
    8668           0 :         struct perf_ksymbol_event ksymbol_event;
    8669           0 :         char name[KSYM_NAME_LEN];
    8670           0 :         u16 flags = 0;
    8671           0 :         int name_len;
    8672             : 
    8673           0 :         if (!atomic_read(&nr_ksymbol_events))
    8674           0 :                 return;
    8675             : 
    8676           0 :         if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX ||
    8677             :             ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
    8678           0 :                 goto err;
    8679             : 
    8680           0 :         strlcpy(name, sym, KSYM_NAME_LEN);
    8681           0 :         name_len = strlen(name) + 1;
    8682           0 :         while (!IS_ALIGNED(name_len, sizeof(u64)))
    8683           0 :                 name[name_len++] = '\0';
    8684           0 :         BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));
    8685             : 
    8686           0 :         if (unregister)
    8687           0 :                 flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER;
    8688             : 
    8689           0 :         ksymbol_event = (struct perf_ksymbol_event){
    8690             :                 .name = name,
    8691             :                 .name_len = name_len,
    8692             :                 .event_id = {
    8693             :                         .header = {
    8694             :                                 .type = PERF_RECORD_KSYMBOL,
    8695           0 :                                 .size = sizeof(ksymbol_event.event_id) +
    8696             :                                         name_len,
    8697             :                         },
    8698             :                         .addr = addr,
    8699             :                         .len = len,
    8700             :                         .ksym_type = ksym_type,
    8701             :                         .flags = flags,
    8702             :                 },
    8703             :         };
    8704             : 
    8705           0 :         perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL);
    8706           0 :         return;
    8707           0 : err:
    8708           0 :         WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
    8709             : }
    8710             : 
    8711             : /*
    8712             :  * bpf program load/unload tracking
    8713             :  */
    8714             : 
    8715             : struct perf_bpf_event {
    8716             :         struct bpf_prog *prog;
    8717             :         struct {
    8718             :                 struct perf_event_header        header;
    8719             :                 u16                             type;
    8720             :                 u16                             flags;
    8721             :                 u32                             id;
    8722             :                 u8                              tag[BPF_TAG_SIZE];
    8723             :         } event_id;
    8724             : };
    8725             : 
    8726           0 : static int perf_event_bpf_match(struct perf_event *event)
    8727             : {
    8728           0 :         return event->attr.bpf_event;
    8729             : }
    8730             : 
    8731           0 : static void perf_event_bpf_output(struct perf_event *event, void *data)
    8732             : {
    8733           0 :         struct perf_bpf_event *bpf_event = data;
    8734           0 :         struct perf_output_handle handle;
    8735           0 :         struct perf_sample_data sample;
    8736           0 :         int ret;
    8737             : 
    8738           0 :         if (!perf_event_bpf_match(event))
    8739           0 :                 return;
    8740             : 
    8741           0 :         perf_event_header__init_id(&bpf_event->event_id.header,
    8742             :                                    &sample, event);
     8743           0 :         ret = perf_output_begin(&handle, &sample, event,
    8744           0 :                                 bpf_event->event_id.header.size);
    8745           0 :         if (ret)
    8746             :                 return;
    8747             : 
    8748           0 :         perf_output_put(&handle, bpf_event->event_id);
    8749           0 :         perf_event__output_id_sample(event, &handle, &sample);
    8750             : 
    8751           0 :         perf_output_end(&handle);
    8752             : }
    8753             : 
    8754           0 : static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
    8755             :                                          enum perf_bpf_event_type type)
    8756             : {
    8757           0 :         bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
    8758           0 :         int i;
    8759             : 
    8760           0 :         if (prog->aux->func_cnt == 0) {
    8761           0 :                 perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
    8762           0 :                                    (u64)(unsigned long)prog->bpf_func,
    8763             :                                    prog->jited_len, unregister,
    8764           0 :                                    prog->aux->ksym.name);
    8765             :         } else {
    8766           0 :                 for (i = 0; i < prog->aux->func_cnt; i++) {
    8767           0 :                         struct bpf_prog *subprog = prog->aux->func[i];
    8768             : 
    8769           0 :                         perf_event_ksymbol(
    8770             :                                 PERF_RECORD_KSYMBOL_TYPE_BPF,
    8771           0 :                                 (u64)(unsigned long)subprog->bpf_func,
    8772             :                                 subprog->jited_len, unregister,
    8773           0 :                                 prog->aux->ksym.name);
    8774             :                 }
    8775             :         }
    8776           0 : }
    8777             : 
    8778           0 : void perf_event_bpf_event(struct bpf_prog *prog,
    8779             :                           enum perf_bpf_event_type type,
    8780             :                           u16 flags)
    8781             : {
    8782           0 :         struct perf_bpf_event bpf_event;
    8783             : 
    8784           0 :         if (type <= PERF_BPF_EVENT_UNKNOWN ||
    8785             :             type >= PERF_BPF_EVENT_MAX)
    8786           0 :                 return;
    8787             : 
    8788           0 :         switch (type) {
    8789             :         case PERF_BPF_EVENT_PROG_LOAD:
    8790             :         case PERF_BPF_EVENT_PROG_UNLOAD:
    8791           0 :                 if (atomic_read(&nr_ksymbol_events))
    8792           0 :                         perf_event_bpf_emit_ksymbols(prog, type);
    8793             :                 break;
    8794             :         default:
    8795             :                 break;
    8796             :         }
    8797             : 
    8798           0 :         if (!atomic_read(&nr_bpf_events))
    8799             :                 return;
    8800             : 
    8801           0 :         bpf_event = (struct perf_bpf_event){
    8802             :                 .prog = prog,
    8803             :                 .event_id = {
    8804             :                         .header = {
    8805             :                                 .type = PERF_RECORD_BPF_EVENT,
    8806             :                                 .size = sizeof(bpf_event.event_id),
    8807             :                         },
    8808             :                         .type = type,
    8809             :                         .flags = flags,
    8810           0 :                         .id = prog->aux->id,
    8811             :                 },
    8812             :         };
    8813             : 
    8814           0 :         BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));
    8815             : 
    8816           0 :         memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE);
    8817           0 :         perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
    8818             : }
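Both streams are opt-in: attr.ksymbol enables PERF_RECORD_KSYMBOL (which also carries the per-subprog BPF symbols emitted above), and attr.bpf_event enables PERF_RECORD_BPF_EVENT itself. An illustrative attr fragment (my example, not kernel code):

        #include <linux/perf_event.h>

        struct perf_event_attr bpf_attr = {
                .size      = sizeof(struct perf_event_attr),
                .type      = PERF_TYPE_SOFTWARE,
                .config    = PERF_COUNT_SW_DUMMY,
                .ksymbol   = 1, /* PERF_RECORD_KSYMBOL on symbol (un)register */
                .bpf_event = 1, /* PERF_RECORD_BPF_EVENT on program load/unload */
        };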
    8819             : 
    8820             : struct perf_text_poke_event {
    8821             :         const void              *old_bytes;
    8822             :         const void              *new_bytes;
    8823             :         size_t                  pad;
    8824             :         u16                     old_len;
    8825             :         u16                     new_len;
    8826             : 
    8827             :         struct {
    8828             :                 struct perf_event_header        header;
    8829             : 
    8830             :                 u64                             addr;
    8831             :         } event_id;
    8832             : };
    8833             : 
    8834           0 : static int perf_event_text_poke_match(struct perf_event *event)
    8835             : {
    8836           0 :         return event->attr.text_poke;
    8837             : }
    8838             : 
    8839           0 : static void perf_event_text_poke_output(struct perf_event *event, void *data)
    8840             : {
    8841           0 :         struct perf_text_poke_event *text_poke_event = data;
    8842           0 :         struct perf_output_handle handle;
    8843           0 :         struct perf_sample_data sample;
    8844           0 :         u64 padding = 0;
    8845           0 :         int ret;
    8846             : 
    8847           0 :         if (!perf_event_text_poke_match(event))
    8848           0 :                 return;
    8849             : 
    8850           0 :         perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
    8851             : 
    8852           0 :         ret = perf_output_begin(&handle, &sample, event,
    8853           0 :                                 text_poke_event->event_id.header.size);
    8854           0 :         if (ret)
    8855             :                 return;
    8856             : 
    8857           0 :         perf_output_put(&handle, text_poke_event->event_id);
    8858           0 :         perf_output_put(&handle, text_poke_event->old_len);
    8859           0 :         perf_output_put(&handle, text_poke_event->new_len);
    8860             : 
    8861           0 :         __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
    8862           0 :         __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
    8863             : 
    8864           0 :         if (text_poke_event->pad)
    8865           0 :                 __output_copy(&handle, &padding, text_poke_event->pad);
    8866             : 
    8867           0 :         perf_event__output_id_sample(event, &handle, &sample);
    8868             : 
    8869           0 :         perf_output_end(&handle);
    8870             : }
    8871             : 
    8872          38 : void perf_event_text_poke(const void *addr, const void *old_bytes,
    8873             :                           size_t old_len, const void *new_bytes, size_t new_len)
    8874             : {
    8875          38 :         struct perf_text_poke_event text_poke_event;
    8876          38 :         size_t tot, pad;
    8877             : 
    8878          38 :         if (!atomic_read(&nr_text_poke_events))
    8879          38 :                 return;
    8880             : 
    8881           0 :         tot  = sizeof(text_poke_event.old_len) + old_len;
    8882           0 :         tot += sizeof(text_poke_event.new_len) + new_len;
    8883           0 :         pad  = ALIGN(tot, sizeof(u64)) - tot;
    8884             : 
    8885           0 :         text_poke_event = (struct perf_text_poke_event){
    8886             :                 .old_bytes    = old_bytes,
    8887             :                 .new_bytes    = new_bytes,
    8888             :                 .pad          = pad,
    8889             :                 .old_len      = old_len,
    8890             :                 .new_len      = new_len,
    8891             :                 .event_id  = {
    8892             :                         .header = {
    8893             :                                 .type = PERF_RECORD_TEXT_POKE,
    8894             :                                 .misc = PERF_RECORD_MISC_KERNEL,
    8895           0 :                                 .size = sizeof(text_poke_event.event_id) + tot + pad,
    8896             :                         },
    8897           0 :                         .addr = (unsigned long)addr,
    8898             :                 },
    8899             :         };
    8900             : 
    8901           0 :         perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
    8902             : }
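The pad computation keeps the variable part of the record u64-aligned. For example (made-up lengths), patching a 5-byte instruction with another 5-byte sequence gives:

        tot = sizeof(u16) + 5 + sizeof(u16) + 5 = 14
        pad = ALIGN(14, 8) - 14                 = 2

so perf_event_text_poke_output() appends two zero bytes after the new bytes and before the trailing sample ID fields.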
    8903             : 
    8904           0 : void perf_event_itrace_started(struct perf_event *event)
    8905             : {
    8906           0 :         event->attach_state |= PERF_ATTACH_ITRACE;
    8907           0 : }
    8908             : 
    8909           0 : static void perf_log_itrace_start(struct perf_event *event)
    8910             : {
    8911           0 :         struct perf_output_handle handle;
    8912           0 :         struct perf_sample_data sample;
    8913           0 :         struct perf_aux_event {
    8914             :                 struct perf_event_header        header;
    8915             :                 u32                             pid;
    8916             :                 u32                             tid;
    8917             :         } rec;
    8918           0 :         int ret;
    8919             : 
    8920           0 :         if (event->parent)
    8921           0 :                 event = event->parent;
    8922             : 
    8923           0 :         if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
    8924           0 :             event->attach_state & PERF_ATTACH_ITRACE)
    8925           0 :                 return;
    8926             : 
    8927           0 :         rec.header.type = PERF_RECORD_ITRACE_START;
    8928           0 :         rec.header.misc = 0;
    8929           0 :         rec.header.size = sizeof(rec);
    8930           0 :         rec.pid = perf_event_pid(event, current);
    8931           0 :         rec.tid = perf_event_tid(event, current);
    8932             : 
    8933           0 :         perf_event_header__init_id(&rec.header, &sample, event);
    8934           0 :         ret = perf_output_begin(&handle, &sample, event, rec.header.size);
    8935             : 
    8936           0 :         if (ret)
    8937             :                 return;
    8938             : 
    8939           0 :         perf_output_put(&handle, rec);
    8940           0 :         perf_event__output_id_sample(event, &handle, &sample);
    8941             : 
    8942           0 :         perf_output_end(&handle);
    8943             : }
    8944             : 
    8945             : static int
    8946           0 : __perf_event_account_interrupt(struct perf_event *event, int throttle)
    8947             : {
    8948           0 :         struct hw_perf_event *hwc = &event->hw;
    8949           0 :         int ret = 0;
    8950           0 :         u64 seq;
    8951             : 
    8952           0 :         seq = __this_cpu_read(perf_throttled_seq);
    8953           0 :         if (seq != hwc->interrupts_seq) {
    8954           0 :                 hwc->interrupts_seq = seq;
    8955           0 :                 hwc->interrupts = 1;
    8956             :         } else {
    8957           0 :                 hwc->interrupts++;
    8958           0 :                 if (unlikely(throttle
    8959             :                              && hwc->interrupts >= max_samples_per_tick)) {
    8960           0 :                         __this_cpu_inc(perf_throttled_count);
    8961           0 :                         tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
    8962           0 :                         hwc->interrupts = MAX_INTERRUPTS;
    8963           0 :                         perf_log_throttle(event, 0);
    8964           0 :                         ret = 1;
    8965             :                 }
    8966             :         }
    8967             : 
    8968           0 :         if (event->attr.freq) {
    8969           0 :                 u64 now = perf_clock();
    8970           0 :                 s64 delta = now - hwc->freq_time_stamp;
    8971             : 
    8972           0 :                 hwc->freq_time_stamp = now;
    8973             : 
    8974           0 :                 if (delta > 0 && delta < 2*TICK_NSEC)
    8975           0 :                         perf_adjust_period(event, delta, hwc->last_period, true);
    8976             :         }
    8977             : 
    8978           0 :         return ret;
    8979             : }
    8980             : 
    8981           0 : int perf_event_account_interrupt(struct perf_event *event)
    8982             : {
    8983           0 :         return __perf_event_account_interrupt(event, 1);
    8984             : }
    8985             : 
    8986             : /*
    8987             :  * Generic event overflow handling, sampling.
    8988             :  */
    8989             : 
    8990           0 : static int __perf_event_overflow(struct perf_event *event,
    8991             :                                    int throttle, struct perf_sample_data *data,
    8992             :                                    struct pt_regs *regs)
    8993             : {
    8994           0 :         int events = atomic_read(&event->event_limit);
    8995           0 :         int ret = 0;
    8996             : 
    8997             :         /*
    8998             :          * Non-sampling counters might still use the PMI to fold short
     8999             :          * hardware counters; ignore those.
    9000             :          */
    9001           0 :         if (unlikely(!is_sampling_event(event)))
    9002             :                 return 0;
    9003             : 
    9004           0 :         ret = __perf_event_account_interrupt(event, throttle);
    9005             : 
    9006             :         /*
    9007             :          * XXX event_limit might not quite work as expected on inherited
    9008             :          * events
    9009             :          */
    9010             : 
    9011           0 :         event->pending_kill = POLL_IN;
    9012           0 :         if (events && atomic_dec_and_test(&event->event_limit)) {
    9013           0 :                 ret = 1;
    9014           0 :                 event->pending_kill = POLL_HUP;
    9015             : 
    9016           0 :                 perf_event_disable_inatomic(event);
    9017             :         }
    9018             : 
    9019           0 :         READ_ONCE(event->overflow_handler)(event, data, regs);
    9020             : 
    9021           0 :         if (*perf_event_fasync(event) && event->pending_kill) {
    9022           0 :                 event->pending_wakeup = 1;
    9023           0 :                 irq_work_queue(&event->pending);
    9024             :         }
    9025             : 
    9026             :         return ret;
    9027             : }
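The event_limit counted down here is what user space arms via the PERF_EVENT_IOC_REFRESH ioctl; once the budget hits zero, the code above switches pending_kill to POLL_HUP and disables the event, so a SIGIO-driven consumer sees the HUP. A hypothetical helper (illustrative, not kernel code):

        #include <sys/ioctl.h>
        #include <linux/perf_event.h>

        /* Allow 'n' more overflows on an already-open sampling event fd. */
        static int perf_allow_overflows(int fd, int n)
        {
                return ioctl(fd, PERF_EVENT_IOC_REFRESH, n);
        }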
    9028             : 
    9029           0 : int perf_event_overflow(struct perf_event *event,
    9030             :                           struct perf_sample_data *data,
    9031             :                           struct pt_regs *regs)
    9032             : {
    9033           0 :         return __perf_event_overflow(event, 1, data, regs);
    9034             : }
    9035             : 
    9036             : /*
    9037             :  * Generic software event infrastructure
    9038             :  */
    9039             : 
    9040             : struct swevent_htable {
    9041             :         struct swevent_hlist            *swevent_hlist;
    9042             :         struct mutex                    hlist_mutex;
    9043             :         int                             hlist_refcount;
    9044             : 
     9045             :         /* Recursion avoidance in each context */
    9046             :         int                             recursion[PERF_NR_CONTEXTS];
    9047             : };
    9048             : 
    9049             : static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
    9050             : 
    9051             : /*
    9052             :  * We directly increment event->count and keep a second value in
     9053             :  * event->hw.period_left to count intervals. This period counter
    9054             :  * is kept in the range [-sample_period, 0] so that we can use the
    9055             :  * sign as trigger.
    9056             :  */
    9057             : 
    9058           0 : u64 perf_swevent_set_period(struct perf_event *event)
    9059             : {
    9060           0 :         struct hw_perf_event *hwc = &event->hw;
    9061           0 :         u64 period = hwc->last_period;
    9062           0 :         u64 nr, offset;
    9063           0 :         s64 old, val;
    9064             : 
    9065           0 :         hwc->last_period = hwc->sample_period;
    9066             : 
    9067           0 : again:
    9068           0 :         old = val = local64_read(&hwc->period_left);
    9069           0 :         if (val < 0)
    9070             :                 return 0;
    9071             : 
    9072           0 :         nr = div64_u64(period + val, period);
    9073           0 :         offset = nr * period;
    9074           0 :         val -= offset;
    9075           0 :         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
    9076           0 :                 goto again;
    9077             : 
    9078             :         return nr;
    9079             : }
    9080             : 
    9081           0 : static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
    9082             :                                     struct perf_sample_data *data,
    9083             :                                     struct pt_regs *regs)
    9084             : {
    9085           0 :         struct hw_perf_event *hwc = &event->hw;
    9086           0 :         int throttle = 0;
    9087             : 
    9088           0 :         if (!overflow)
    9089           0 :                 overflow = perf_swevent_set_period(event);
    9090             : 
    9091           0 :         if (hwc->interrupts == MAX_INTERRUPTS)
    9092             :                 return;
    9093             : 
    9094           0 :         for (; overflow; overflow--) {
    9095           0 :                 if (__perf_event_overflow(event, throttle,
    9096             :                                             data, regs)) {
    9097             :                         /*
    9098             :                          * We inhibit the overflow from happening when
    9099             :                          * hwc->interrupts == MAX_INTERRUPTS.
    9100             :                          */
    9101             :                         break;
    9102             :                 }
    9103           0 :                 throttle = 1;
    9104             :         }
    9105             : }
    9106             : 
    9107           0 : static void perf_swevent_event(struct perf_event *event, u64 nr,
    9108             :                                struct perf_sample_data *data,
    9109             :                                struct pt_regs *regs)
    9110             : {
    9111           0 :         struct hw_perf_event *hwc = &event->hw;
    9112             : 
    9113           0 :         local64_add(nr, &event->count);
    9114             : 
    9115           0 :         if (!regs)
    9116             :                 return;
    9117             : 
    9118           0 :         if (!is_sampling_event(event))
    9119             :                 return;
    9120             : 
    9121           0 :         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
    9122           0 :                 data->period = nr;
    9123           0 :                 return perf_swevent_overflow(event, 1, data, regs);
    9124             :         } else
    9125           0 :                 data->period = event->hw.last_period;
    9126             : 
    9127           0 :         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
    9128           0 :                 return perf_swevent_overflow(event, 1, data, regs);
    9129             : 
    9130           0 :         if (local64_add_negative(nr, &hwc->period_left))
    9131             :                 return;
    9132             : 
    9133           0 :         perf_swevent_overflow(event, 0, data, regs);
    9134             : }
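A concrete trace of the period bookkeeping (made-up numbers): with hwc->sample_period == 100 and hwc->period_left == -30, a software event reporting nr == 250 proceeds as:

        period_left: -30 + 250         = 220  (local64_add_negative() sees >= 0)
        overflows:   (100 + 220) / 100 = 3    (perf_swevent_set_period())
        period_left: 220 - 3 * 100     = -80  (re-armed below the next boundary)

so __perf_event_overflow() runs up to three times (fewer if throttling kicks in) and the counter is left 80 units short of its next period.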
    9135             : 
    9136           0 : static int perf_exclude_event(struct perf_event *event,
    9137             :                               struct pt_regs *regs)
    9138             : {
    9139           0 :         if (event->hw.state & PERF_HES_STOPPED)
    9140             :                 return 1;
    9141             : 
    9142           0 :         if (regs) {
    9143           0 :                 if (event->attr.exclude_user && user_mode(regs))
    9144             :                         return 1;
    9145             : 
    9146           0 :                 if (event->attr.exclude_kernel && !user_mode(regs))
    9147           0 :                         return 1;
    9148             :         }
    9149             : 
    9150             :         return 0;
    9151             : }
    9152             : 
    9153           0 : static int perf_swevent_match(struct perf_event *event,
    9154             :                                 enum perf_type_id type,
    9155             :                                 u32 event_id,
    9156             :                                 struct perf_sample_data *data,
    9157             :                                 struct pt_regs *regs)
    9158             : {
    9159           0 :         if (event->attr.type != type)
    9160             :                 return 0;
    9161             : 
    9162           0 :         if (event->attr.config != event_id)
    9163             :                 return 0;
    9164             : 
    9165           0 :         if (perf_exclude_event(event, regs))
    9166           0 :                 return 0;
    9167             : 
    9168             :         return 1;
    9169             : }
    9170             : 
    9171           0 : static inline u64 swevent_hash(u64 type, u32 event_id)
    9172             : {
    9173           0 :         u64 val = event_id | (type << 32);
    9174             : 
    9175           0 :         return hash_64(val, SWEVENT_HLIST_BITS);
    9176             : }
    9177             : 
    9178             : static inline struct hlist_head *
    9179           0 : __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
    9180             : {
    9181           0 :         u64 hash = swevent_hash(type, event_id);
    9182             : 
    9183           0 :         return &hlist->heads[hash];
    9184             : }
    9185             : 
    9186             : /* For the read side: events when they trigger */
    9187             : static inline struct hlist_head *
    9188           0 : find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
    9189             : {
    9190           0 :         struct swevent_hlist *hlist;
    9191             : 
    9192           0 :         hlist = rcu_dereference(swhash->swevent_hlist);
    9193           0 :         if (!hlist)
    9194             :                 return NULL;
    9195             : 
    9196           0 :         return __find_swevent_head(hlist, type, event_id);
    9197             : }
    9198             : 
    9199             : /* For the event head insertion and removal in the hlist */
    9200             : static inline struct hlist_head *
    9201           0 : find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
    9202             : {
    9203           0 :         struct swevent_hlist *hlist;
    9204           0 :         u32 event_id = event->attr.config;
    9205           0 :         u64 type = event->attr.type;
    9206             : 
    9207             :         /*
    9208             :          * Event scheduling is always serialized against hlist allocation
     9209             :          * and release, which makes the protected version suitable here.
    9210             :          * The context lock guarantees that.
    9211             :          */
    9212           0 :         hlist = rcu_dereference_protected(swhash->swevent_hlist,
    9213             :                                           lockdep_is_held(&event->ctx->lock));
    9214           0 :         if (!hlist)
    9215             :                 return NULL;
    9216             : 
    9217           0 :         return __find_swevent_head(hlist, type, event_id);
    9218             : }
    9219             : 
    9220           0 : static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
    9221             :                                     u64 nr,
    9222             :                                     struct perf_sample_data *data,
    9223             :                                     struct pt_regs *regs)
    9224             : {
    9225           0 :         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
    9226           0 :         struct perf_event *event;
    9227           0 :         struct hlist_head *head;
    9228             : 
    9229           0 :         rcu_read_lock();
    9230           0 :         head = find_swevent_head_rcu(swhash, type, event_id);
    9231           0 :         if (!head)
    9232           0 :                 goto end;
    9233             : 
    9234           0 :         hlist_for_each_entry_rcu(event, head, hlist_entry) {
    9235           0 :                 if (perf_swevent_match(event, type, event_id, data, regs))
    9236           0 :                         perf_swevent_event(event, nr, data, regs);
    9237             :         }
    9238           0 : end:
    9239           0 :         rcu_read_unlock();
    9240           0 : }
    9241             : 
    9242             : DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
    9243             : 
    9244           0 : int perf_swevent_get_recursion_context(void)
    9245             : {
    9246           0 :         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
    9247             : 
    9248           0 :         return get_recursion_context(swhash->recursion);
    9249             : }
    9250             : EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
    9251             : 
    9252           0 : void perf_swevent_put_recursion_context(int rctx)
    9253             : {
    9254           0 :         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
    9255             : 
    9256           0 :         put_recursion_context(swhash->recursion, rctx);
    9257           0 : }
    9258             : 
    9259           0 : void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
    9260             : {
    9261           0 :         struct perf_sample_data data;
    9262             : 
    9263           0 :         if (WARN_ON_ONCE(!regs))
    9264           0 :                 return;
    9265             : 
    9266           0 :         perf_sample_data_init(&data, addr, 0);
    9267           0 :         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
    9268             : }
    9269             : 
    9270           0 : void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
    9271             : {
    9272           0 :         int rctx;
    9273             : 
    9274           0 :         preempt_disable_notrace();
    9275           0 :         rctx = perf_swevent_get_recursion_context();
    9276           0 :         if (unlikely(rctx < 0))
    9277           0 :                 goto fail;
    9278             : 
    9279           0 :         ___perf_sw_event(event_id, nr, regs, addr);
    9280             : 
    9281           0 :         perf_swevent_put_recursion_context(rctx);
    9282           0 : fail:
    9283           0 :         preempt_enable_notrace();
    9284           0 : }
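Callers elsewhere in the kernel do not use __perf_sw_event() directly; they go through the static-key-guarded perf_sw_event() wrapper in <linux/perf_event.h>, so this whole path costs nothing while no software event of the given type exists. Modelled on the page-fault handlers (illustrative, not code from this file):

        /* inside a fault handler, with 'regs' and 'address' in scope */
        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);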
    9285             : 
    9286           0 : static void perf_swevent_read(struct perf_event *event)
    9287             : {
    9288           0 : }
    9289             : 
    9290           0 : static int perf_swevent_add(struct perf_event *event, int flags)
    9291             : {
    9292           0 :         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
    9293           0 :         struct hw_perf_event *hwc = &event->hw;
    9294           0 :         struct hlist_head *head;
    9295             : 
    9296           0 :         if (is_sampling_event(event)) {
    9297           0 :                 hwc->last_period = hwc->sample_period;
    9298           0 :                 perf_swevent_set_period(event);
    9299             :         }
    9300             : 
    9301           0 :         hwc->state = !(flags & PERF_EF_START);
    9302             : 
    9303           0 :         head = find_swevent_head(swhash, event);
    9304           0 :         if (WARN_ON_ONCE(!head))
    9305             :                 return -EINVAL;
    9306             : 
    9307           0 :         hlist_add_head_rcu(&event->hlist_entry, head);
    9308           0 :         perf_event_update_userpage(event);
    9309             : 
    9310           0 :         return 0;
    9311             : }
    9312             : 
    9313           0 : static void perf_swevent_del(struct perf_event *event, int flags)
    9314             : {
    9315           0 :         hlist_del_rcu(&event->hlist_entry);
    9316           0 : }
    9317             : 
    9318           0 : static void perf_swevent_start(struct perf_event *event, int flags)
    9319             : {
    9320           0 :         event->hw.state = 0;
    9321           0 : }
    9322             : 
    9323           0 : static void perf_swevent_stop(struct perf_event *event, int flags)
    9324             : {
    9325           0 :         event->hw.state = PERF_HES_STOPPED;
    9326           0 : }
    9327             : 
    9328             : /* Deref the hlist from the update side */
    9329             : static inline struct swevent_hlist *
    9330           0 : swevent_hlist_deref(struct swevent_htable *swhash)
    9331             : {
    9332           0 :         return rcu_dereference_protected(swhash->swevent_hlist,
    9333             :                                          lockdep_is_held(&swhash->hlist_mutex));
    9334             : }
    9335             : 
    9336           0 : static void swevent_hlist_release(struct swevent_htable *swhash)
    9337             : {
    9338           0 :         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
    9339             : 
    9340           0 :         if (!hlist)
    9341             :                 return;
    9342             : 
    9343           0 :         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
    9344           0 :         kfree_rcu(hlist, rcu_head);
    9345             : }
    9346             : 
    9347           0 : static void swevent_hlist_put_cpu(int cpu)
    9348             : {
    9349           0 :         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
    9350             : 
    9351           0 :         mutex_lock(&swhash->hlist_mutex);
    9352             : 
    9353           0 :         if (!--swhash->hlist_refcount)
    9354           0 :                 swevent_hlist_release(swhash);
    9355             : 
    9356           0 :         mutex_unlock(&swhash->hlist_mutex);
    9357           0 : }
    9358             : 
    9359           0 : static void swevent_hlist_put(void)
    9360             : {
    9361           0 :         int cpu;
    9362             : 
    9363           0 :         for_each_possible_cpu(cpu)
    9364           0 :                 swevent_hlist_put_cpu(cpu);
    9365           0 : }
    9366             : 
    9367           0 : static int swevent_hlist_get_cpu(int cpu)
    9368             : {
    9369           0 :         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
    9370           0 :         int err = 0;
    9371             : 
    9372           0 :         mutex_lock(&swhash->hlist_mutex);
    9373           0 :         if (!swevent_hlist_deref(swhash) &&
    9374           0 :             cpumask_test_cpu(cpu, perf_online_mask)) {
    9375           0 :                 struct swevent_hlist *hlist;
    9376             : 
    9377           0 :                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
    9378           0 :                 if (!hlist) {
    9379           0 :                         err = -ENOMEM;
    9380           0 :                         goto exit;
    9381             :                 }
    9382           0 :                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
    9383             :         }
    9384           0 :         swhash->hlist_refcount++;
    9385           0 : exit:
    9386           0 :         mutex_unlock(&swhash->hlist_mutex);
    9387             : 
    9388           0 :         return err;
    9389             : }
    9390             : 
    9391           0 : static int swevent_hlist_get(void)
    9392             : {
    9393           0 :         int err, cpu, failed_cpu;
    9394             : 
    9395           0 :         mutex_lock(&pmus_lock);
    9396           0 :         for_each_possible_cpu(cpu) {
    9397           0 :                 err = swevent_hlist_get_cpu(cpu);
    9398           0 :                 if (err) {
    9399           0 :                         failed_cpu = cpu;
    9400           0 :                         goto fail;
    9401             :                 }
    9402             :         }
    9403           0 :         mutex_unlock(&pmus_lock);
    9404           0 :         return 0;
    9405           0 : fail:
    9406           0 :         for_each_possible_cpu(cpu) {
    9407           0 :                 if (cpu == failed_cpu)
    9408             :                         break;
    9409           0 :                 swevent_hlist_put_cpu(cpu);
    9410             :         }
    9411           0 :         mutex_unlock(&pmus_lock);
    9412           0 :         return err;
    9413             : }
    9414             : 
    9415             : struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
    9416             : 
    9417           0 : static void sw_perf_event_destroy(struct perf_event *event)
    9418             : {
    9419           0 :         u64 event_id = event->attr.config;
    9420             : 
    9421           0 :         WARN_ON(event->parent);
    9422             : 
    9423           0 :         static_key_slow_dec(&perf_swevent_enabled[event_id]);
    9424           0 :         swevent_hlist_put();
    9425           0 : }
    9426             : 
    9427           0 : static int perf_swevent_init(struct perf_event *event)
    9428             : {
    9429           0 :         u64 event_id = event->attr.config;
    9430             : 
    9431           0 :         if (event->attr.type != PERF_TYPE_SOFTWARE)
    9432             :                 return -ENOENT;
    9433             : 
    9434             :         /*
    9435             :          * no branch sampling for software events
    9436             :          */
    9437           0 :         if (has_branch_stack(event))
    9438             :                 return -EOPNOTSUPP;
    9439             : 
    9440           0 :         switch (event_id) {
    9441             :         case PERF_COUNT_SW_CPU_CLOCK:
    9442             :         case PERF_COUNT_SW_TASK_CLOCK:
    9443             :                 return -ENOENT;
    9444             : 
    9445             :         default:
    9446           0 :                 break;
    9447             :         }
    9448             : 
    9449           0 :         if (event_id >= PERF_COUNT_SW_MAX)
    9450             :                 return -ENOENT;
    9451             : 
    9452           0 :         if (!event->parent) {
    9453           0 :                 int err;
    9454             : 
    9455           0 :                 err = swevent_hlist_get();
    9456           0 :                 if (err)
    9457             :                         return err;
    9458             : 
    9459           0 :                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
    9460           0 :                 event->destroy = sw_perf_event_destroy;
    9461             :         }
    9462             : 
    9463             :         return 0;
    9464             : }
    9465             : 
    9466             : static struct pmu perf_swevent = {
    9467             :         .task_ctx_nr    = perf_sw_context,
    9468             : 
    9469             :         .capabilities   = PERF_PMU_CAP_NO_NMI,
    9470             : 
    9471             :         .event_init     = perf_swevent_init,
    9472             :         .add            = perf_swevent_add,
    9473             :         .del            = perf_swevent_del,
    9474             :         .start          = perf_swevent_start,
    9475             :         .stop           = perf_swevent_stop,
    9476             :         .read           = perf_swevent_read,
    9477             : };
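/*
 * Illustrative sketch, not part of this file (function name and values are
 * made up for illustration): the kind of userspace request that lands in
 * perf_swevent_init() above, namely a sampling software event such as
 * PERF_COUNT_SW_PAGE_FAULTS.  The CPU/TASK clock ids are rejected above
 * because they are served by the dedicated clock PMUs further down in this
 * file.
 */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_sw_page_faults(pid_t pid)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_SOFTWARE;          /* matched by perf_swevent_init() */
        attr.size = sizeof(attr);
        attr.config = PERF_COUNT_SW_PAGE_FAULTS;
        attr.sample_period = 1000;               /* makes is_sampling_event() true */

        /* monitor @pid on any CPU; no group leader, no flags */
        return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}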
    9478             : 
    9479             : #ifdef CONFIG_EVENT_TRACING
    9480             : 
    9481           0 : static int perf_tp_filter_match(struct perf_event *event,
    9482             :                                 struct perf_sample_data *data)
    9483             : {
    9484           0 :         void *record = data->raw->frag.data;
    9485             : 
    9486             :         /* only top level events have filters set */
    9487           0 :         if (event->parent)
    9488           0 :                 event = event->parent;
    9489             : 
    9490           0 :         if (likely(!event->filter) || filter_match_preds(event->filter, record))
    9491           0 :                 return 1;
    9492             :         return 0;
    9493             : }
    9494             : 
    9495           0 : static int perf_tp_event_match(struct perf_event *event,
    9496             :                                 struct perf_sample_data *data,
    9497             :                                 struct pt_regs *regs)
    9498             : {
    9499           0 :         if (event->hw.state & PERF_HES_STOPPED)
    9500             :                 return 0;
    9501             :         /*
    9502             :          * If exclude_kernel, only trace user-space tracepoints (uprobes)
    9503             :          */
    9504           0 :         if (event->attr.exclude_kernel && !user_mode(regs))
    9505             :                 return 0;
    9506             : 
    9507           0 :         if (!perf_tp_filter_match(event, data))
    9508           0 :                 return 0;
    9509             : 
    9510             :         return 1;
    9511             : }
    9512             : 
    9513           0 : void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
    9514             :                                struct trace_event_call *call, u64 count,
    9515             :                                struct pt_regs *regs, struct hlist_head *head,
    9516             :                                struct task_struct *task)
    9517             : {
    9518           0 :         if (bpf_prog_array_valid(call)) {
    9519           0 :                 *(struct pt_regs **)raw_data = regs;
    9520           0 :                 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
    9521           0 :                         perf_swevent_put_recursion_context(rctx);
    9522           0 :                         return;
    9523             :                 }
    9524             :         }
    9525           0 :         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
    9526             :                       rctx, task);
    9527             : }
    9528             : EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
    9529             : 
    9530           0 : void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
    9531             :                    struct pt_regs *regs, struct hlist_head *head, int rctx,
    9532             :                    struct task_struct *task)
    9533             : {
    9534           0 :         struct perf_sample_data data;
    9535           0 :         struct perf_event *event;
    9536             : 
    9537           0 :         struct perf_raw_record raw = {
    9538             :                 .frag = {
    9539             :                         .size = entry_size,
    9540             :                         .data = record,
    9541             :                 },
    9542             :         };
    9543             : 
    9544           0 :         perf_sample_data_init(&data, 0, 0);
    9545           0 :         data.raw = &raw;
    9546             : 
    9547           0 :         perf_trace_buf_update(record, event_type);
    9548             : 
    9549           0 :         hlist_for_each_entry_rcu(event, head, hlist_entry) {
    9550           0 :                 if (perf_tp_event_match(event, &data, regs))
    9551           0 :                         perf_swevent_event(event, count, &data, regs);
    9552             :         }
    9553             : 
    9554             :         /*
    9555             :          * If we got specified a target task, also iterate its context and
    9556             :          * deliver this event there too.
    9557             :          */
    9558           0 :         if (task && task != current) {
    9559           0 :                 struct perf_event_context *ctx;
    9560           0 :                 struct trace_entry *entry = record;
    9561             : 
    9562           0 :                 rcu_read_lock();
    9563           0 :                 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
    9564           0 :                 if (!ctx)
    9565           0 :                         goto unlock;
    9566             : 
    9567           0 :                 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
    9568           0 :                         if (event->cpu != smp_processor_id())
    9569           0 :                                 continue;
    9570           0 :                         if (event->attr.type != PERF_TYPE_TRACEPOINT)
    9571           0 :                                 continue;
    9572           0 :                         if (event->attr.config != entry->type)
    9573           0 :                                 continue;
    9574           0 :                         if (perf_tp_event_match(event, &data, regs))
    9575           0 :                                 perf_swevent_event(event, count, &data, regs);
    9576             :                 }
    9577           0 : unlock:
    9578           0 :                 rcu_read_unlock();
    9579             :         }
    9580             : 
    9581           0 :         perf_swevent_put_recursion_context(rctx);
    9582           0 : }
    9583             : EXPORT_SYMBOL_GPL(perf_tp_event);
    9584             : 
    9585           0 : static void tp_perf_event_destroy(struct perf_event *event)
    9586             : {
    9587           0 :         perf_trace_destroy(event);
    9588           0 : }
    9589             : 
    9590           0 : static int perf_tp_event_init(struct perf_event *event)
    9591             : {
    9592           0 :         int err;
    9593             : 
    9594           0 :         if (event->attr.type != PERF_TYPE_TRACEPOINT)
    9595             :                 return -ENOENT;
    9596             : 
    9597             :         /*
    9598             :          * no branch sampling for tracepoint events
    9599             :          */
    9600           0 :         if (has_branch_stack(event))
    9601             :                 return -EOPNOTSUPP;
    9602             : 
    9603           0 :         err = perf_trace_init(event);
    9604           0 :         if (err)
    9605             :                 return err;
    9606             : 
    9607           0 :         event->destroy = tp_perf_event_destroy;
    9608             : 
    9609           0 :         return 0;
    9610             : }
    9611             : 
    9612             : static struct pmu perf_tracepoint = {
    9613             :         .task_ctx_nr    = perf_sw_context,
    9614             : 
    9615             :         .event_init     = perf_tp_event_init,
    9616             :         .add            = perf_trace_add,
    9617             :         .del            = perf_trace_del,
    9618             :         .start          = perf_swevent_start,
    9619             :         .stop           = perf_swevent_stop,
    9620             :         .read           = perf_swevent_read,
    9621             : };
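/*
 * Illustrative sketch, not part of this file (function name is made up for
 * illustration): the matching userspace setup for a tracepoint event.
 * attr.config carries the tracepoint id exported through tracefs, e.g. the
 * contents of /sys/kernel/tracing/events/sched/sched_switch/id, which
 * perf_trace_init() resolves back to the trace event.
 */
#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_tracepoint(unsigned long long tracepoint_id, pid_t pid)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.type = PERF_TYPE_TRACEPOINT;        /* matched by perf_tp_event_init() */
        attr.size = sizeof(attr);
        attr.config = tracepoint_id;             /* id read from tracefs */
        attr.sample_period = 1;                  /* record every hit */

        return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}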
    9622             : 
    9623             : #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
    9624             : /*
     9625             :  * Flags in config, used by the dynamic PMUs kprobe and uprobe.
     9626             :  * The flags should match the following PMU_FORMAT_ATTR().
     9627             :  *
     9628             :  * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
     9629             :  *                               if not set, create kprobe/uprobe
     9630             :  *
     9631             :  * The following values specify a reference counter (or semaphore, in the
     9632             :  * terminology of tools like dtrace, systemtap, etc.) for Userspace Statically
     9633             :  * Defined Tracepoints (USDT). Currently, we use 32 bits for the offset.
     9634             :  *
     9635             :  * PERF_UPROBE_REF_CTR_OFFSET_BITS      # of bits in config used for the offset
     9636             :  * PERF_UPROBE_REF_CTR_OFFSET_SHIFT     # of bits to shift the offset left by
    9637             :  */
    9638             : enum perf_probe_config {
    9639             :         PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
    9640             :         PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
    9641             :         PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
    9642             : };
    9643             : 
    9644             : PMU_FORMAT_ATTR(retprobe, "config:0");
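/*
 * Illustrative helper, not part of the kernel (the function name is made up):
 * how an attr.config value for these dynamic PMUs is laid out, mirroring the
 * decode in perf_kprobe_event_init() and perf_uprobe_event_init() below.
 * Bit 0 selects a return probe; for uprobes, bits 32-63 carry the USDT
 * reference counter offset.
 */
static inline u64 probe_attr_config(bool is_retprobe, u64 ref_ctr_offset)
{
        return (ref_ctr_offset << PERF_UPROBE_REF_CTR_OFFSET_SHIFT) |
               (is_retprobe ? PERF_PROBE_CONFIG_IS_RETPROBE : 0);
}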
    9645             : #endif
    9646             : 
    9647             : #ifdef CONFIG_KPROBE_EVENTS
    9648             : static struct attribute *kprobe_attrs[] = {
    9649             :         &format_attr_retprobe.attr,
    9650             :         NULL,
    9651             : };
    9652             : 
    9653             : static struct attribute_group kprobe_format_group = {
    9654             :         .name = "format",
    9655             :         .attrs = kprobe_attrs,
    9656             : };
    9657             : 
    9658             : static const struct attribute_group *kprobe_attr_groups[] = {
    9659             :         &kprobe_format_group,
    9660             :         NULL,
    9661             : };
    9662             : 
    9663             : static int perf_kprobe_event_init(struct perf_event *event);
    9664             : static struct pmu perf_kprobe = {
    9665             :         .task_ctx_nr    = perf_sw_context,
    9666             :         .event_init     = perf_kprobe_event_init,
    9667             :         .add            = perf_trace_add,
    9668             :         .del            = perf_trace_del,
    9669             :         .start          = perf_swevent_start,
    9670             :         .stop           = perf_swevent_stop,
    9671             :         .read           = perf_swevent_read,
    9672             :         .attr_groups    = kprobe_attr_groups,
    9673             : };
    9674             : 
    9675             : static int perf_kprobe_event_init(struct perf_event *event)
    9676             : {
    9677             :         int err;
    9678             :         bool is_retprobe;
    9679             : 
    9680             :         if (event->attr.type != perf_kprobe.type)
    9681             :                 return -ENOENT;
    9682             : 
    9683             :         if (!perfmon_capable())
    9684             :                 return -EACCES;
    9685             : 
    9686             :         /*
    9687             :          * no branch sampling for probe events
    9688             :          */
    9689             :         if (has_branch_stack(event))
    9690             :                 return -EOPNOTSUPP;
    9691             : 
    9692             :         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
    9693             :         err = perf_kprobe_init(event, is_retprobe);
    9694             :         if (err)
    9695             :                 return err;
    9696             : 
    9697             :         event->destroy = perf_kprobe_destroy;
    9698             : 
    9699             :         return 0;
    9700             : }
    9701             : #endif /* CONFIG_KPROBE_EVENTS */
    9702             : 
    9703             : #ifdef CONFIG_UPROBE_EVENTS
    9704             : PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");
    9705             : 
    9706             : static struct attribute *uprobe_attrs[] = {
    9707             :         &format_attr_retprobe.attr,
    9708             :         &format_attr_ref_ctr_offset.attr,
    9709             :         NULL,
    9710             : };
    9711             : 
    9712             : static struct attribute_group uprobe_format_group = {
    9713             :         .name = "format",
    9714             :         .attrs = uprobe_attrs,
    9715             : };
    9716             : 
    9717             : static const struct attribute_group *uprobe_attr_groups[] = {
    9718             :         &uprobe_format_group,
    9719             :         NULL,
    9720             : };
    9721             : 
    9722             : static int perf_uprobe_event_init(struct perf_event *event);
    9723             : static struct pmu perf_uprobe = {
    9724             :         .task_ctx_nr    = perf_sw_context,
    9725             :         .event_init     = perf_uprobe_event_init,
    9726             :         .add            = perf_trace_add,
    9727             :         .del            = perf_trace_del,
    9728             :         .start          = perf_swevent_start,
    9729             :         .stop           = perf_swevent_stop,
    9730             :         .read           = perf_swevent_read,
    9731             :         .attr_groups    = uprobe_attr_groups,
    9732             : };
    9733             : 
    9734             : static int perf_uprobe_event_init(struct perf_event *event)
    9735             : {
    9736             :         int err;
    9737             :         unsigned long ref_ctr_offset;
    9738             :         bool is_retprobe;
    9739             : 
    9740             :         if (event->attr.type != perf_uprobe.type)
    9741             :                 return -ENOENT;
    9742             : 
    9743             :         if (!perfmon_capable())
    9744             :                 return -EACCES;
    9745             : 
    9746             :         /*
    9747             :          * no branch sampling for probe events
    9748             :          */
    9749             :         if (has_branch_stack(event))
    9750             :                 return -EOPNOTSUPP;
    9751             : 
    9752             :         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
    9753             :         ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
    9754             :         err = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
    9755             :         if (err)
    9756             :                 return err;
    9757             : 
    9758             :         event->destroy = perf_uprobe_destroy;
    9759             : 
    9760             :         return 0;
    9761             : }
    9762             : #endif /* CONFIG_UPROBE_EVENTS */
    9763             : 
    9764           1 : static inline void perf_tp_register(void)
    9765             : {
    9766           1 :         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
    9767             : #ifdef CONFIG_KPROBE_EVENTS
    9768             :         perf_pmu_register(&perf_kprobe, "kprobe", -1);
    9769             : #endif
    9770             : #ifdef CONFIG_UPROBE_EVENTS
    9771             :         perf_pmu_register(&perf_uprobe, "uprobe", -1);
    9772             : #endif
    9773             : }
    9774             : 
    9775           0 : static void perf_event_free_filter(struct perf_event *event)
    9776             : {
    9777           0 :         ftrace_profile_free_filter(event);
    9778             : }
    9779             : 
    9780             : #ifdef CONFIG_BPF_SYSCALL
    9781             : static void bpf_overflow_handler(struct perf_event *event,
    9782             :                                  struct perf_sample_data *data,
    9783             :                                  struct pt_regs *regs)
    9784             : {
    9785             :         struct bpf_perf_event_data_kern ctx = {
    9786             :                 .data = data,
    9787             :                 .event = event,
    9788             :         };
    9789             :         int ret = 0;
    9790             : 
    9791             :         ctx.regs = perf_arch_bpf_user_pt_regs(regs);
    9792             :         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
    9793             :                 goto out;
    9794             :         rcu_read_lock();
    9795             :         ret = BPF_PROG_RUN(event->prog, &ctx);
    9796             :         rcu_read_unlock();
    9797             : out:
    9798             :         __this_cpu_dec(bpf_prog_active);
    9799             :         if (!ret)
    9800             :                 return;
    9801             : 
    9802             :         event->orig_overflow_handler(event, data, regs);
    9803             : }
    9804             : 
    9805             : static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
    9806             : {
    9807             :         struct bpf_prog *prog;
    9808             : 
    9809             :         if (event->overflow_handler_context)
    9810             :                 /* hw breakpoint or kernel counter */
    9811             :                 return -EINVAL;
    9812             : 
    9813             :         if (event->prog)
    9814             :                 return -EEXIST;
    9815             : 
    9816             :         prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
    9817             :         if (IS_ERR(prog))
    9818             :                 return PTR_ERR(prog);
    9819             : 
    9820             :         if (event->attr.precise_ip &&
    9821             :             prog->call_get_stack &&
    9822             :             (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
    9823             :              event->attr.exclude_callchain_kernel ||
    9824             :              event->attr.exclude_callchain_user)) {
    9825             :                 /*
    9826             :                  * On perf_event with precise_ip, calling bpf_get_stack()
    9827             :                  * may trigger unwinder warnings and occasional crashes.
    9828             :                  * bpf_get_[stack|stackid] works around this issue by using
    9829             :                  * callchain attached to perf_sample_data. If the
     9830             :                  * perf_event does not have a full (kernel and user) callchain
    9831             :                  * attached to perf_sample_data, do not allow attaching BPF
    9832             :                  * program that calls bpf_get_[stack|stackid].
    9833             :                  */
    9834             :                 bpf_prog_put(prog);
    9835             :                 return -EPROTO;
    9836             :         }
    9837             : 
    9838             :         event->prog = prog;
    9839             :         event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
    9840             :         WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
    9841             :         return 0;
    9842             : }
    9843             : 
    9844             : static void perf_event_free_bpf_handler(struct perf_event *event)
    9845             : {
    9846             :         struct bpf_prog *prog = event->prog;
    9847             : 
    9848             :         if (!prog)
    9849             :                 return;
    9850             : 
    9851             :         WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
    9852             :         event->prog = NULL;
    9853             :         bpf_prog_put(prog);
    9854             : }
    9855             : #else
    9856             : static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
    9857             : {
    9858             :         return -EOPNOTSUPP;
    9859             : }
    9860             : static void perf_event_free_bpf_handler(struct perf_event *event)
    9861             : {
    9862             : }
    9863             : #endif
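/*
 * Illustrative usage, not from this file (perf_fd and bpf_prog_fd are
 * placeholder names): userspace reaches perf_event_set_bpf_prog() below
 * through the PERF_EVENT_IOC_SET_BPF ioctl, passing the fd of an already
 * loaded BPF program:
 *
 *      if (ioctl(perf_fd, PERF_EVENT_IOC_SET_BPF, bpf_prog_fd))
 *              perror("PERF_EVENT_IOC_SET_BPF");
 *
 * For an event that is not a tracepoint or k/uprobe this installs
 * bpf_overflow_handler() via perf_event_set_bpf_handler() above; for tracing
 * events the program is attached to the trace event instead.
 */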
    9864             : 
    9865             : /*
     9866             :  * returns true if the event is a tracepoint, or a kprobe/uprobe created
    9867             :  * with perf_event_open()
    9868             :  */
    9869           0 : static inline bool perf_event_is_tracing(struct perf_event *event)
    9870             : {
    9871           0 :         if (event->pmu == &perf_tracepoint)
    9872           0 :                 return true;
    9873             : #ifdef CONFIG_KPROBE_EVENTS
    9874             :         if (event->pmu == &perf_kprobe)
    9875             :                 return true;
    9876             : #endif
    9877             : #ifdef CONFIG_UPROBE_EVENTS
    9878             :         if (event->pmu == &perf_uprobe)
    9879             :                 return true;
    9880             : #endif
    9881             :         return false;
    9882             : }
    9883             : 
    9884           0 : static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
    9885             : {
    9886           0 :         bool is_kprobe, is_tracepoint, is_syscall_tp;
    9887           0 :         struct bpf_prog *prog;
    9888           0 :         int ret;
    9889             : 
    9890           0 :         if (!perf_event_is_tracing(event))
    9891           0 :                 return perf_event_set_bpf_handler(event, prog_fd);
    9892             : 
    9893           0 :         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
    9894           0 :         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
    9895           0 :         is_syscall_tp = is_syscall_trace_event(event->tp_event);
    9896           0 :         if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
    9897             :                 /* bpf programs can only be attached to u/kprobe or tracepoint */
    9898             :                 return -EINVAL;
    9899             : 
    9900           0 :         prog = bpf_prog_get(prog_fd);
    9901           0 :         if (IS_ERR(prog))
    9902           0 :                 return PTR_ERR(prog);
    9903             : 
    9904             :         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
    9905             :             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
    9906             :             (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
    9907             :                 /* valid fd, but invalid bpf program type */
    9908             :                 bpf_prog_put(prog);
    9909             :                 return -EINVAL;
    9910             :         }
    9911             : 
    9912             :         /* Kprobe override only works for kprobes, not uprobes. */
    9913             :         if (prog->kprobe_override &&
    9914             :             !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
    9915             :                 bpf_prog_put(prog);
    9916             :                 return -EINVAL;
    9917             :         }
    9918             : 
    9919             :         if (is_tracepoint || is_syscall_tp) {
    9920             :                 int off = trace_event_get_offsets(event->tp_event);
    9921             : 
    9922             :                 if (prog->aux->max_ctx_offset > off) {
    9923             :                         bpf_prog_put(prog);
    9924             :                         return -EACCES;
    9925             :                 }
    9926             :         }
    9927             : 
    9928             :         ret = perf_event_attach_bpf_prog(event, prog);
    9929             :         if (ret)
    9930             :                 bpf_prog_put(prog);
    9931             :         return ret;
    9932             : }
    9933             : 
    9934           0 : static void perf_event_free_bpf_prog(struct perf_event *event)
    9935             : {
    9936           0 :         if (!perf_event_is_tracing(event)) {
    9937             :                 perf_event_free_bpf_handler(event);
    9938             :                 return;
    9939             :         }
    9940           0 :         perf_event_detach_bpf_prog(event);
    9941             : }
    9942             : 
    9943             : #else
    9944             : 
    9945             : static inline void perf_tp_register(void)
    9946             : {
    9947             : }
    9948             : 
    9949             : static void perf_event_free_filter(struct perf_event *event)
    9950             : {
    9951             : }
    9952             : 
    9953             : static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
    9954             : {
    9955             :         return -ENOENT;
    9956             : }
    9957             : 
    9958             : static void perf_event_free_bpf_prog(struct perf_event *event)
    9959             : {
    9960             : }
    9961             : #endif /* CONFIG_EVENT_TRACING */
    9962             : 
    9963             : #ifdef CONFIG_HAVE_HW_BREAKPOINT
    9964           0 : void perf_bp_event(struct perf_event *bp, void *data)
    9965             : {
    9966           0 :         struct perf_sample_data sample;
    9967           0 :         struct pt_regs *regs = data;
    9968             : 
    9969           0 :         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
    9970             : 
    9971           0 :         if (!bp->hw.state && !perf_exclude_event(bp, regs))
    9972           0 :                 perf_swevent_event(bp, 1, &sample, regs);
    9973           0 : }
    9974             : #endif
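/*
 * Illustrative sketch, not from this file (the function name is made up): the
 * attr layout of a hardware breakpoint that is ultimately delivered through
 * perf_bp_event() above, here watching four bytes at @addr for writes.
 */
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>

static void init_watchpoint_attr(struct perf_event_attr *attr, unsigned long addr)
{
        memset(attr, 0, sizeof(*attr));
        attr->type = PERF_TYPE_BREAKPOINT;
        attr->size = sizeof(*attr);
        attr->bp_type = HW_BREAKPOINT_W;         /* break on data write */
        attr->bp_addr = addr;
        attr->bp_len = HW_BREAKPOINT_LEN_4;
        attr->sample_period = 1;                 /* fire on every hit */
}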
    9975             : 
    9976             : /*
    9977             :  * Allocate a new address filter
    9978             :  */
    9979             : static struct perf_addr_filter *
    9980           0 : perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
    9981             : {
    9982           0 :         int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
    9983           0 :         struct perf_addr_filter *filter;
    9984             : 
    9985           0 :         filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
    9986           0 :         if (!filter)
    9987             :                 return NULL;
    9988             : 
    9989           0 :         INIT_LIST_HEAD(&filter->entry);
    9990           0 :         list_add_tail(&filter->entry, filters);
    9991             : 
    9992           0 :         return filter;
    9993             : }
    9994             : 
    9995           0 : static void free_filters_list(struct list_head *filters)
    9996             : {
    9997           0 :         struct perf_addr_filter *filter, *iter;
    9998             : 
    9999           0 :         list_for_each_entry_safe(filter, iter, filters, entry) {
   10000           0 :                 path_put(&filter->path);
   10001           0 :                 list_del(&filter->entry);
   10002           0 :                 kfree(filter);
   10003             :         }
   10004           0 : }
   10005             : 
   10006             : /*
   10007             :  * Free existing address filters and optionally install new ones
   10008             :  */
   10009           0 : static void perf_addr_filters_splice(struct perf_event *event,
   10010             :                                      struct list_head *head)
   10011             : {
   10012           0 :         unsigned long flags;
   10013           0 :         LIST_HEAD(list);
   10014             : 
   10015           0 :         if (!has_addr_filter(event))
   10016           0 :                 return;
   10017             : 
   10018             :         /* don't bother with children, they don't have their own filters */
   10019           0 :         if (event->parent)
   10020             :                 return;
   10021             : 
   10022           0 :         raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
   10023             : 
   10024           0 :         list_splice_init(&event->addr_filters.list, &list);
   10025           0 :         if (head)
   10026           0 :                 list_splice(head, &event->addr_filters.list);
   10027             : 
   10028           0 :         raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
   10029             : 
   10030           0 :         free_filters_list(&list);
   10031             : }
   10032             : 
   10033             : /*
   10034             :  * Scan through mm's vmas and see if one of them matches the
   10035             :  * @filter; if so, adjust filter's address range.
   10036             :  * Called with mm::mmap_lock down for reading.
   10037             :  */
   10038           0 : static void perf_addr_filter_apply(struct perf_addr_filter *filter,
   10039             :                                    struct mm_struct *mm,
   10040             :                                    struct perf_addr_filter_range *fr)
   10041             : {
   10042           0 :         struct vm_area_struct *vma;
   10043             : 
   10044           0 :         for (vma = mm->mmap; vma; vma = vma->vm_next) {
   10045           0 :                 if (!vma->vm_file)
   10046           0 :                         continue;
   10047             : 
   10048           0 :                 if (perf_addr_filter_vma_adjust(filter, vma, fr))
   10049             :                         return;
   10050             :         }
   10051             : }
   10052             : 
   10053             : /*
   10054             :  * Update event's address range filters based on the
   10055             :  * task's existing mappings, if any.
   10056             :  */
   10057           0 : static void perf_event_addr_filters_apply(struct perf_event *event)
   10058             : {
   10059           0 :         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
   10060           0 :         struct task_struct *task = READ_ONCE(event->ctx->task);
   10061           0 :         struct perf_addr_filter *filter;
   10062           0 :         struct mm_struct *mm = NULL;
   10063           0 :         unsigned int count = 0;
   10064           0 :         unsigned long flags;
   10065             : 
   10066             :         /*
   10067             :          * We may observe TASK_TOMBSTONE, which means that the event tear-down
    10068             :          * will block on the parent's child_mutex, which our caller is also holding.
   10069             :          */
   10070           0 :         if (task == TASK_TOMBSTONE)
   10071             :                 return;
   10072             : 
   10073           0 :         if (ifh->nr_file_filters) {
   10074           0 :                 mm = get_task_mm(event->ctx->task);
   10075           0 :                 if (!mm)
   10076           0 :                         goto restart;
   10077             : 
   10078           0 :                 mmap_read_lock(mm);
   10079             :         }
   10080             : 
   10081           0 :         raw_spin_lock_irqsave(&ifh->lock, flags);
   10082           0 :         list_for_each_entry(filter, &ifh->list, entry) {
   10083           0 :                 if (filter->path.dentry) {
   10084             :                         /*
   10085             :                          * Adjust base offset if the filter is associated to a
   10086             :                          * binary that needs to be mapped:
   10087             :                          */
   10088           0 :                         event->addr_filter_ranges[count].start = 0;
   10089           0 :                         event->addr_filter_ranges[count].size = 0;
   10090             : 
   10091           0 :                         perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
   10092             :                 } else {
   10093           0 :                         event->addr_filter_ranges[count].start = filter->offset;
   10094           0 :                         event->addr_filter_ranges[count].size  = filter->size;
   10095             :                 }
   10096             : 
   10097           0 :                 count++;
   10098             :         }
   10099             : 
   10100           0 :         event->addr_filters_gen++;
   10101           0 :         raw_spin_unlock_irqrestore(&ifh->lock, flags);
   10102             : 
   10103           0 :         if (ifh->nr_file_filters) {
   10104           0 :                 mmap_read_unlock(mm);
   10105             : 
   10106           0 :                 mmput(mm);
   10107             :         }
   10108             : 
   10109           0 : restart:
   10110           0 :         perf_event_stop(event, 1);
   10111             : }
   10112             : 
   10113             : /*
   10114             :  * Address range filtering: limiting the data to certain
   10115             :  * instruction address ranges. Filters are ioctl()ed to us from
    10116             :  * userspace as ASCII strings.
   10117             :  *
   10118             :  * Filter string format:
   10119             :  *
   10120             :  * ACTION RANGE_SPEC
   10121             :  * where ACTION is one of the
   10122             :  *  * "filter": limit the trace to this region
   10123             :  *  * "start": start tracing from this address
   10124             :  *  * "stop": stop tracing at this address/region;
   10125             :  * RANGE_SPEC is
   10126             :  *  * for kernel addresses: <start address>[/<size>]
   10127             :  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
   10128             :  *
    10129             :  * If <size> is not specified or is zero, the range is treated as a single
    10130             :  * address; this is not valid for ACTION == "filter".
   10131             :  */
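/*
 * For example (illustrative values and paths only):
 *
 *  filter 0x1000/0x2000@/usr/lib/libfoo.so    trace only that range of the object
 *  start 0xffffffff81000000/0x4000            start tracing at this kernel range
 *  stop 0xffffffff81004000                    stop tracing at this kernel address
 *
 * Several filters can be passed in one string, separated by spaces, commas or
 * newlines; see the token table and parser below.
 */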
   10132             : enum {
   10133             :         IF_ACT_NONE = -1,
   10134             :         IF_ACT_FILTER,
   10135             :         IF_ACT_START,
   10136             :         IF_ACT_STOP,
   10137             :         IF_SRC_FILE,
   10138             :         IF_SRC_KERNEL,
   10139             :         IF_SRC_FILEADDR,
   10140             :         IF_SRC_KERNELADDR,
   10141             : };
   10142             : 
   10143             : enum {
   10144             :         IF_STATE_ACTION = 0,
   10145             :         IF_STATE_SOURCE,
   10146             :         IF_STATE_END,
   10147             : };
   10148             : 
   10149             : static const match_table_t if_tokens = {
   10150             :         { IF_ACT_FILTER,        "filter" },
   10151             :         { IF_ACT_START,         "start" },
   10152             :         { IF_ACT_STOP,          "stop" },
   10153             :         { IF_SRC_FILE,          "%u/%u@%s" },
   10154             :         { IF_SRC_KERNEL,        "%u/%u" },
   10155             :         { IF_SRC_FILEADDR,      "%u@%s" },
   10156             :         { IF_SRC_KERNELADDR,    "%u" },
   10157             :         { IF_ACT_NONE,          NULL },
   10158             : };
   10159             : 
   10160             : /*
   10161             :  * Address filter string parser
   10162             :  */
   10163             : static int
   10164           0 : perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
   10165             :                              struct list_head *filters)
   10166             : {
   10167           0 :         struct perf_addr_filter *filter = NULL;
   10168           0 :         char *start, *orig, *filename = NULL;
   10169           0 :         substring_t args[MAX_OPT_ARGS];
   10170           0 :         int state = IF_STATE_ACTION, token;
   10171           0 :         unsigned int kernel = 0;
   10172           0 :         int ret = -EINVAL;
   10173             : 
   10174           0 :         orig = fstr = kstrdup(fstr, GFP_KERNEL);
   10175           0 :         if (!fstr)
   10176             :                 return -ENOMEM;
   10177             : 
   10178           0 :         while ((start = strsep(&fstr, " ,\n")) != NULL) {
   10179           0 :                 static const enum perf_addr_filter_action_t actions[] = {
   10180             :                         [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
   10181             :                         [IF_ACT_START]  = PERF_ADDR_FILTER_ACTION_START,
   10182             :                         [IF_ACT_STOP]   = PERF_ADDR_FILTER_ACTION_STOP,
   10183             :                 };
   10184           0 :                 ret = -EINVAL;
   10185             : 
   10186           0 :                 if (!*start)
   10187           0 :                         continue;
   10188             : 
   10189             :                 /* filter definition begins */
   10190           0 :                 if (state == IF_STATE_ACTION) {
   10191           0 :                         filter = perf_addr_filter_new(event, filters);
   10192           0 :                         if (!filter)
   10193           0 :                                 goto fail;
   10194             :                 }
   10195             : 
   10196           0 :                 token = match_token(start, if_tokens, args);
   10197           0 :                 switch (token) {
   10198           0 :                 case IF_ACT_FILTER:
   10199             :                 case IF_ACT_START:
   10200             :                 case IF_ACT_STOP:
   10201           0 :                         if (state != IF_STATE_ACTION)
   10202           0 :                                 goto fail;
   10203             : 
   10204           0 :                         filter->action = actions[token];
   10205           0 :                         state = IF_STATE_SOURCE;
   10206           0 :                         break;
   10207             : 
   10208           0 :                 case IF_SRC_KERNELADDR:
   10209             :                 case IF_SRC_KERNEL:
   10210           0 :                         kernel = 1;
   10211           0 :                         fallthrough;
   10212             : 
   10213           0 :                 case IF_SRC_FILEADDR:
   10214             :                 case IF_SRC_FILE:
   10215           0 :                         if (state != IF_STATE_SOURCE)
   10216           0 :                                 goto fail;
   10217             : 
   10218           0 :                         *args[0].to = 0;
   10219           0 :                         ret = kstrtoul(args[0].from, 0, &filter->offset);
   10220           0 :                         if (ret)
   10221           0 :                                 goto fail;
   10222             : 
   10223           0 :                         if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
   10224           0 :                                 *args[1].to = 0;
   10225           0 :                                 ret = kstrtoul(args[1].from, 0, &filter->size);
   10226           0 :                                 if (ret)
   10227           0 :                                         goto fail;
   10228             :                         }
   10229             : 
   10230           0 :                         if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
   10231           0 :                                 int fpos = token == IF_SRC_FILE ? 2 : 1;
   10232             : 
   10233           0 :                                 kfree(filename);
   10234           0 :                                 filename = match_strdup(&args[fpos]);
   10235           0 :                                 if (!filename) {
   10236           0 :                                         ret = -ENOMEM;
   10237           0 :                                         goto fail;
   10238             :                                 }
   10239             :                         }
   10240             : 
   10241             :                         state = IF_STATE_END;
   10242             :                         break;
   10243             : 
   10244           0 :                 default:
   10245           0 :                         goto fail;
   10246             :                 }
   10247             : 
   10248             :                 /*
   10249             :                  * Filter definition is fully parsed, validate and install it.
   10250             :                  * Make sure that it doesn't contradict itself or the event's
   10251             :                  * attribute.
   10252             :                  */
   10253           0 :                 if (state == IF_STATE_END) {
   10254           0 :                         ret = -EINVAL;
   10255           0 :                         if (kernel && event->attr.exclude_kernel)
   10256           0 :                                 goto fail;
   10257             : 
   10258             :                         /*
   10259             :                          * ACTION "filter" must have a non-zero length region
   10260             :                          * specified.
   10261             :                          */
   10262           0 :                         if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
   10263           0 :                             !filter->size)
   10264           0 :                                 goto fail;
   10265             : 
   10266           0 :                         if (!kernel) {
   10267           0 :                                 if (!filename)
   10268           0 :                                         goto fail;
   10269             : 
   10270             :                                 /*
   10271             :                                  * For now, we only support file-based filters
   10272             :                                  * in per-task events; doing so for CPU-wide
   10273             :                                  * events requires additional context switching
    10274             :                                  * trickery, since the same object code will be
   10275             :                                  * mapped at different virtual addresses in
   10276             :                                  * different processes.
   10277             :                                  */
   10278           0 :                                 ret = -EOPNOTSUPP;
   10279           0 :                                 if (!event->ctx->task)
   10280           0 :                                         goto fail;
   10281             : 
   10282             :                                 /* look up the path and grab its inode */
   10283           0 :                                 ret = kern_path(filename, LOOKUP_FOLLOW,
   10284             :                                                 &filter->path);
   10285           0 :                                 if (ret)
   10286           0 :                                         goto fail;
   10287             : 
   10288           0 :                                 ret = -EINVAL;
   10289           0 :                                 if (!filter->path.dentry ||
   10290           0 :                                     !S_ISREG(d_inode(filter->path.dentry)
   10291             :                                              ->i_mode))
   10292           0 :                                         goto fail;
   10293             : 
   10294           0 :                                 event->addr_filters.nr_file_filters++;
   10295             :                         }
   10296             : 
   10297             :                         /* ready to consume more filters */
   10298             :                         state = IF_STATE_ACTION;
   10299             :                         filter = NULL;
   10300             :                 }
   10301             :         }
   10302             : 
   10303           0 :         if (state != IF_STATE_ACTION)
   10304           0 :                 goto fail;
   10305             : 
   10306           0 :         kfree(filename);
   10307           0 :         kfree(orig);
   10308             : 
   10309           0 :         return 0;
   10310             : 
   10311           0 : fail:
   10312           0 :         kfree(filename);
   10313           0 :         free_filters_list(filters);
   10314           0 :         kfree(orig);
   10315             : 
   10316           0 :         return ret;
   10317             : }
   10318             : 
   10319             : static int
   10320           0 : perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
   10321             : {
   10322           0 :         LIST_HEAD(filters);
   10323           0 :         int ret;
   10324             : 
   10325             :         /*
    10326             :          * Since this is called in the perf_ioctl() path, we're already holding
   10327             :          * ctx::mutex.
   10328             :          */
   10329           0 :         lockdep_assert_held(&event->ctx->mutex);
   10330             : 
   10331           0 :         if (WARN_ON_ONCE(event->parent))
   10332             :                 return -EINVAL;
   10333             : 
   10334           0 :         ret = perf_event_parse_addr_filter(event, filter_str, &filters);
   10335           0 :         if (ret)
   10336           0 :                 goto fail_clear_files;
   10337             : 
   10338           0 :         ret = event->pmu->addr_filters_validate(&filters);
   10339           0 :         if (ret)
   10340           0 :                 goto fail_free_filters;
   10341             : 
   10342             :         /* remove existing filters, if any */
   10343           0 :         perf_addr_filters_splice(event, &filters);
   10344             : 
   10345             :         /* install new filters */
   10346           0 :         perf_event_for_each_child(event, perf_event_addr_filters_apply);
   10347             : 
   10348           0 :         return ret;
   10349             : 
   10350           0 : fail_free_filters:
   10351           0 :         free_filters_list(&filters);
   10352             : 
   10353           0 : fail_clear_files:
   10354           0 :         event->addr_filters.nr_file_filters = 0;
   10355             : 
   10356           0 :         return ret;
   10357             : }
   10358             : 
   10359           0 : static int perf_event_set_filter(struct perf_event *event, void __user *arg)
   10360             : {
   10361           0 :         int ret = -EINVAL;
   10362           0 :         char *filter_str;
   10363             : 
   10364           0 :         filter_str = strndup_user(arg, PAGE_SIZE);
   10365           0 :         if (IS_ERR(filter_str))
   10366           0 :                 return PTR_ERR(filter_str);
   10367             : 
   10368             : #ifdef CONFIG_EVENT_TRACING
   10369           0 :         if (perf_event_is_tracing(event)) {
   10370           0 :                 struct perf_event_context *ctx = event->ctx;
   10371             : 
   10372             :                 /*
   10373             :                  * Beware, here be dragons!!
   10374             :                  *
   10375             :                  * the tracepoint muck will deadlock against ctx->mutex, but
   10376             :                  * the tracepoint stuff does not actually need it. So
   10377             :                  * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
   10378             :                  * already have a reference on ctx.
   10379             :                  *
    10380             :                  * This can result in the event getting moved to a different ctx,
   10381             :                  * but that does not affect the tracepoint state.
   10382             :                  */
   10383           0 :                 mutex_unlock(&ctx->mutex);
   10384           0 :                 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
   10385           0 :                 mutex_lock(&ctx->mutex);
   10386             :         } else
   10387             : #endif
   10388           0 :         if (has_addr_filter(event))
   10389           0 :                 ret = perf_event_set_addr_filter(event, filter_str);
   10390             : 
   10391           0 :         kfree(filter_str);
   10392           0 :         return ret;
   10393             : }
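/*
 * Illustrative usage, not from this file (perf_fd and the filter string are
 * placeholders): both branches above are driven by the same ioctl from
 * userspace, e.g.
 *
 *      if (ioctl(perf_fd, PERF_EVENT_IOC_SET_FILTER,
 *                "filter 0x1000/0x2000@/usr/lib/libfoo.so"))
 *              perror("PERF_EVENT_IOC_SET_FILTER");
 *
 * For tracing events the string is handed to ftrace_profile_set_filter();
 * for PMUs with address filtering it is parsed by
 * perf_event_parse_addr_filter() above.
 */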
   10394             : 
   10395             : /*
   10396             :  * hrtimer based swevent callback
   10397             :  */
   10398             : 
   10399           0 : static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
   10400             : {
   10401           0 :         enum hrtimer_restart ret = HRTIMER_RESTART;
   10402           0 :         struct perf_sample_data data;
   10403           0 :         struct pt_regs *regs;
   10404           0 :         struct perf_event *event;
   10405           0 :         u64 period;
   10406             : 
   10407           0 :         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
   10408             : 
   10409           0 :         if (event->state != PERF_EVENT_STATE_ACTIVE)
   10410             :                 return HRTIMER_NORESTART;
   10411             : 
   10412           0 :         event->pmu->read(event);
   10413             : 
   10414           0 :         perf_sample_data_init(&data, 0, event->hw.last_period);
   10415           0 :         regs = get_irq_regs();
   10416             : 
   10417           0 :         if (regs && !perf_exclude_event(event, regs)) {
   10418           0 :                 if (!(event->attr.exclude_idle && is_idle_task(current)))
   10419           0 :                         if (__perf_event_overflow(event, 1, &data, regs))
   10420           0 :                                 ret = HRTIMER_NORESTART;
   10421             :         }
   10422             : 
   10423           0 :         period = max_t(u64, 10000, event->hw.sample_period);
   10424           0 :         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
   10425             : 
   10426           0 :         return ret;
   10427             : }
   10428             : 
   10429           0 : static void perf_swevent_start_hrtimer(struct perf_event *event)
   10430             : {
   10431           0 :         struct hw_perf_event *hwc = &event->hw;
   10432           0 :         s64 period;
   10433             : 
   10434           0 :         if (!is_sampling_event(event))
   10435             :                 return;
   10436             : 
   10437           0 :         period = local64_read(&hwc->period_left);
   10438           0 :         if (period) {
   10439           0 :                 if (period < 0)
   10440           0 :                         period = 10000;
   10441             : 
   10442           0 :                 local64_set(&hwc->period_left, 0);
   10443             :         } else {
   10444           0 :                 period = max_t(u64, 10000, hwc->sample_period);
   10445             :         }
   10446           0 :         hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
   10447             :                       HRTIMER_MODE_REL_PINNED_HARD);
   10448             : }
   10449             : 
   10450           0 : static void perf_swevent_cancel_hrtimer(struct perf_event *event)
   10451             : {
   10452           0 :         struct hw_perf_event *hwc = &event->hw;
   10453             : 
   10454           0 :         if (is_sampling_event(event)) {
   10455           0 :                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
   10456           0 :                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
   10457             : 
   10458           0 :                 hrtimer_cancel(&hwc->hrtimer);
   10459             :         }
   10460           0 : }
   10461             : 
   10462           0 : static void perf_swevent_init_hrtimer(struct perf_event *event)
   10463             : {
   10464           0 :         struct hw_perf_event *hwc = &event->hw;
   10465             : 
   10466           0 :         if (!is_sampling_event(event))
   10467             :                 return;
   10468             : 
   10469           0 :         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
   10470           0 :         hwc->hrtimer.function = perf_swevent_hrtimer;
   10471             : 
   10472             :         /*
   10473             :          * Since hrtimers have a fixed rate, we can do a static freq->period
   10474             :          * mapping and avoid the whole period adjust feedback stuff.
   10475             :          */
   10476           0 :         if (event->attr.freq) {
   10477           0 :                 long freq = event->attr.sample_freq;
   10478             : 
   10479           0 :                 event->attr.sample_period = NSEC_PER_SEC / freq;
   10480           0 :                 hwc->sample_period = event->attr.sample_period;
   10481           0 :                 local64_set(&hwc->period_left, hwc->sample_period);
   10482           0 :                 hwc->last_period = hwc->sample_period;
   10483           0 :                 event->attr.freq = 0;
   10484             :         }
   10485             : }
   10486             : 
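To make the comment above concrete: with an illustrative attr.sample_freq of 4000 Hz (a made-up value, not one taken from this report), the static mapping simply becomes

	sample_period = NSEC_PER_SEC / sample_freq
	              = 1,000,000,000 ns / 4000
	              = 250,000 ns   (a 250 us hrtimer period)

and attr.freq is cleared, so the usual dynamic period-adjustment feedback never runs for hrtimer-driven software events.
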
   10487             : /*
   10488             :  * Software event: cpu wall time clock
   10489             :  */
   10490             : 
   10491           0 : static void cpu_clock_event_update(struct perf_event *event)
   10492             : {
   10493           0 :         s64 prev;
   10494           0 :         u64 now;
   10495             : 
   10496           0 :         now = local_clock();
   10497           0 :         prev = local64_xchg(&event->hw.prev_count, now);
   10498           0 :         local64_add(now - prev, &event->count);
   10499           0 : }
   10500             : 
   10501           0 : static void cpu_clock_event_start(struct perf_event *event, int flags)
   10502             : {
   10503           0 :         local64_set(&event->hw.prev_count, local_clock());
   10504           0 :         perf_swevent_start_hrtimer(event);
   10505           0 : }
   10506             : 
   10507           0 : static void cpu_clock_event_stop(struct perf_event *event, int flags)
   10508             : {
   10509           0 :         perf_swevent_cancel_hrtimer(event);
   10510           0 :         cpu_clock_event_update(event);
   10511           0 : }
   10512             : 
   10513           0 : static int cpu_clock_event_add(struct perf_event *event, int flags)
   10514             : {
   10515           0 :         if (flags & PERF_EF_START)
   10516           0 :                 cpu_clock_event_start(event, flags);
   10517           0 :         perf_event_update_userpage(event);
   10518             : 
   10519           0 :         return 0;
   10520             : }
   10521             : 
   10522           0 : static void cpu_clock_event_del(struct perf_event *event, int flags)
   10523             : {
   10524           0 :         cpu_clock_event_stop(event, flags);
   10525           0 : }
   10526             : 
   10527           0 : static void cpu_clock_event_read(struct perf_event *event)
   10528             : {
   10529           0 :         cpu_clock_event_update(event);
   10530           0 : }
   10531             : 
   10532           0 : static int cpu_clock_event_init(struct perf_event *event)
   10533             : {
   10534           0 :         if (event->attr.type != PERF_TYPE_SOFTWARE)
   10535             :                 return -ENOENT;
   10536             : 
   10537           0 :         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
   10538             :                 return -ENOENT;
   10539             : 
   10540             :         /*
   10541             :          * no branch sampling for software events
   10542             :          */
   10543           0 :         if (has_branch_stack(event))
   10544             :                 return -EOPNOTSUPP;
   10545             : 
   10546           0 :         perf_swevent_init_hrtimer(event);
   10547             : 
   10548           0 :         return 0;
   10549             : }
   10550             : 
   10551             : static struct pmu perf_cpu_clock = {
   10552             :         .task_ctx_nr    = perf_sw_context,
   10553             : 
   10554             :         .capabilities   = PERF_PMU_CAP_NO_NMI,
   10555             : 
   10556             :         .event_init     = cpu_clock_event_init,
   10557             :         .add            = cpu_clock_event_add,
   10558             :         .del            = cpu_clock_event_del,
   10559             :         .start          = cpu_clock_event_start,
   10560             :         .stop           = cpu_clock_event_stop,
   10561             :         .read           = cpu_clock_event_read,
   10562             : };
   10563             : 
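cpu_clock_event_init() above accepts exactly one attribute combination: PERF_TYPE_SOFTWARE with PERF_COUNT_SW_CPU_CLOCK. A minimal userspace sketch of a plain counting read against this PMU (no sampling, so the hrtimer machinery above stays idle); perf_event_open() has no glibc wrapper, hence the raw syscall:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;		/* checked by cpu_clock_event_init() */
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	fd = syscall(SYS_perf_event_open, &attr, 0 /* this task */,
		     -1 /* any cpu */, -1 /* no group */, 0);
	if (fd < 0)
		return 1;

	/* ... the workload being measured ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cpu clock: %llu ns\n", (unsigned long long)count);
	close(fd);
	return 0;
}
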
   10564             : /*
   10565             :  * Software event: task time clock
   10566             :  */
   10567             : 
   10568           0 : static void task_clock_event_update(struct perf_event *event, u64 now)
   10569             : {
   10570           0 :         u64 prev;
   10571           0 :         s64 delta;
   10572             : 
   10573           0 :         prev = local64_xchg(&event->hw.prev_count, now);
   10574           0 :         delta = now - prev;
   10575           0 :         local64_add(delta, &event->count);
   10576           0 : }
   10577             : 
   10578           0 : static void task_clock_event_start(struct perf_event *event, int flags)
   10579             : {
   10580           0 :         local64_set(&event->hw.prev_count, event->ctx->time);
   10581           0 :         perf_swevent_start_hrtimer(event);
   10582           0 : }
   10583             : 
   10584           0 : static void task_clock_event_stop(struct perf_event *event, int flags)
   10585             : {
   10586           0 :         perf_swevent_cancel_hrtimer(event);
   10587           0 :         task_clock_event_update(event, event->ctx->time);
   10588           0 : }
   10589             : 
   10590           0 : static int task_clock_event_add(struct perf_event *event, int flags)
   10591             : {
   10592           0 :         if (flags & PERF_EF_START)
   10593           0 :                 task_clock_event_start(event, flags);
   10594           0 :         perf_event_update_userpage(event);
   10595             : 
   10596           0 :         return 0;
   10597             : }
   10598             : 
   10599           0 : static void task_clock_event_del(struct perf_event *event, int flags)
   10600             : {
   10601           0 :         task_clock_event_stop(event, PERF_EF_UPDATE);
   10602           0 : }
   10603             : 
   10604           0 : static void task_clock_event_read(struct perf_event *event)
   10605             : {
   10606           0 :         u64 now = perf_clock();
   10607           0 :         u64 delta = now - event->ctx->timestamp;
   10608           0 :         u64 time = event->ctx->time + delta;
   10609             : 
   10610           0 :         task_clock_event_update(event, time);
   10611           0 : }
   10612             : 
   10613           0 : static int task_clock_event_init(struct perf_event *event)
   10614             : {
   10615           0 :         if (event->attr.type != PERF_TYPE_SOFTWARE)
   10616             :                 return -ENOENT;
   10617             : 
   10618           0 :         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
   10619             :                 return -ENOENT;
   10620             : 
   10621             :         /*
   10622             :          * no branch sampling for software events
   10623             :          */
   10624           0 :         if (has_branch_stack(event))
   10625             :                 return -EOPNOTSUPP;
   10626             : 
   10627           0 :         perf_swevent_init_hrtimer(event);
   10628             : 
   10629           0 :         return 0;
   10630             : }
   10631             : 
   10632             : static struct pmu perf_task_clock = {
   10633             :         .task_ctx_nr    = perf_sw_context,
   10634             : 
   10635             :         .capabilities   = PERF_PMU_CAP_NO_NMI,
   10636             : 
   10637             :         .event_init     = task_clock_event_init,
   10638             :         .add            = task_clock_event_add,
   10639             :         .del            = task_clock_event_del,
   10640             :         .start          = task_clock_event_start,
   10641             :         .stop           = task_clock_event_stop,
   10642             :         .read           = task_clock_event_read,
   10643             : };
   10644             : 
   10645           0 : static void perf_pmu_nop_void(struct pmu *pmu)
   10646             : {
   10647           0 : }
   10648             : 
   10649           0 : static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
   10650             : {
   10651           0 : }
   10652             : 
   10653           0 : static int perf_pmu_nop_int(struct pmu *pmu)
   10654             : {
   10655           0 :         return 0;
   10656             : }
   10657             : 
   10658           0 : static int perf_event_nop_int(struct perf_event *event, u64 value)
   10659             : {
   10660           0 :         return 0;
   10661             : }
   10662             : 
   10663             : static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
   10664             : 
   10665           0 : static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
   10666             : {
   10667           0 :         __this_cpu_write(nop_txn_flags, flags);
   10668             : 
   10669           0 :         if (flags & ~PERF_PMU_TXN_ADD)
   10670             :                 return;
   10671             : 
   10672           0 :         perf_pmu_disable(pmu);
   10673             : }
   10674             : 
   10675           0 : static int perf_pmu_commit_txn(struct pmu *pmu)
   10676             : {
   10677           0 :         unsigned int flags = __this_cpu_read(nop_txn_flags);
   10678             : 
   10679           0 :         __this_cpu_write(nop_txn_flags, 0);
   10680             : 
   10681           0 :         if (flags & ~PERF_PMU_TXN_ADD)
   10682             :                 return 0;
   10683             : 
   10684           0 :         perf_pmu_enable(pmu);
   10685           0 :         return 0;
   10686             : }
   10687             : 
   10688           0 : static void perf_pmu_cancel_txn(struct pmu *pmu)
   10689             : {
    10690           0 :         unsigned int flags = __this_cpu_read(nop_txn_flags);
   10691             : 
   10692           0 :         __this_cpu_write(nop_txn_flags, 0);
   10693             : 
   10694           0 :         if (flags & ~PERF_PMU_TXN_ADD)
   10695             :                 return;
   10696             : 
   10697           0 :         perf_pmu_enable(pmu);
   10698             : }
   10699             : 
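These nop and batching stubs exist so the core can always drive the same transaction shape when scheduling an event group, whether or not the PMU can batch hardware accesses. A simplified sketch of that caller-side pattern, modelled loosely on group_sched_in() elsewhere in this file (unwinding of already-added siblings is omitted here):

static int group_sched_in_sketch(struct pmu *pmu, struct perf_event *leader)
{
	struct perf_event *sibling;

	pmu->start_txn(pmu, PERF_PMU_TXN_ADD);

	if (pmu->add(leader, PERF_EF_START))
		goto cancel;

	for_each_sibling_event(sibling, leader) {
		if (pmu->add(sibling, PERF_EF_START))
			goto cancel;
	}

	if (!pmu->commit_txn(pmu))
		return 0;

cancel:
	pmu->cancel_txn(pmu);
	return -EAGAIN;
}
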
   10700           0 : static int perf_event_idx_default(struct perf_event *event)
   10701             : {
   10702           0 :         return 0;
   10703             : }
   10704             : 
   10705             : /*
   10706             :  * Ensures all contexts with the same task_ctx_nr have the same
   10707             :  * pmu_cpu_context too.
   10708             :  */
   10709           7 : static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
   10710             : {
   10711           7 :         struct pmu *pmu;
   10712             : 
   10713           7 :         if (ctxn < 0)
   10714             :                 return NULL;
   10715             : 
   10716          12 :         list_for_each_entry(pmu, &pmus, entry) {
   10717          10 :                 if (pmu->task_ctx_nr == ctxn)
   10718           5 :                         return pmu->pmu_cpu_context;
   10719             :         }
   10720             : 
   10721             :         return NULL;
   10722             : }
   10723             : 
   10724           0 : static void free_pmu_context(struct pmu *pmu)
   10725             : {
   10726             :         /*
   10727             :          * Static contexts such as perf_sw_context have a global lifetime
   10728             :          * and may be shared between different PMUs. Avoid freeing them
   10729             :          * when a single PMU is going away.
   10730             :          */
   10731           0 :         if (pmu->task_ctx_nr > perf_invalid_context)
   10732             :                 return;
   10733             : 
   10734           0 :         free_percpu(pmu->pmu_cpu_context);
   10735             : }
   10736             : 
   10737             : /*
   10738             :  * Let userspace know that this PMU supports address range filtering:
   10739             :  */
   10740           0 : static ssize_t nr_addr_filters_show(struct device *dev,
   10741             :                                     struct device_attribute *attr,
   10742             :                                     char *page)
   10743             : {
   10744           0 :         struct pmu *pmu = dev_get_drvdata(dev);
   10745             : 
   10746           0 :         return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
   10747             : }
   10748             : DEVICE_ATTR_RO(nr_addr_filters);
   10749             : 
   10750             : static struct idr pmu_idr;
   10751             : 
   10752             : static ssize_t
   10753           0 : type_show(struct device *dev, struct device_attribute *attr, char *page)
   10754             : {
   10755           0 :         struct pmu *pmu = dev_get_drvdata(dev);
   10756             : 
   10757           0 :         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
   10758             : }
   10759             : static DEVICE_ATTR_RO(type);
   10760             : 
   10761             : static ssize_t
   10762           0 : perf_event_mux_interval_ms_show(struct device *dev,
   10763             :                                 struct device_attribute *attr,
   10764             :                                 char *page)
   10765             : {
   10766           0 :         struct pmu *pmu = dev_get_drvdata(dev);
   10767             : 
   10768           0 :         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
   10769             : }
   10770             : 
   10771             : static DEFINE_MUTEX(mux_interval_mutex);
   10772             : 
   10773             : static ssize_t
   10774           0 : perf_event_mux_interval_ms_store(struct device *dev,
   10775             :                                  struct device_attribute *attr,
   10776             :                                  const char *buf, size_t count)
   10777             : {
   10778           0 :         struct pmu *pmu = dev_get_drvdata(dev);
   10779           0 :         int timer, cpu, ret;
   10780             : 
   10781           0 :         ret = kstrtoint(buf, 0, &timer);
   10782           0 :         if (ret)
   10783           0 :                 return ret;
   10784             : 
   10785           0 :         if (timer < 1)
   10786             :                 return -EINVAL;
   10787             : 
    10788             :         /* same value, nothing to do */
   10789           0 :         if (timer == pmu->hrtimer_interval_ms)
   10790           0 :                 return count;
   10791             : 
   10792           0 :         mutex_lock(&mux_interval_mutex);
   10793           0 :         pmu->hrtimer_interval_ms = timer;
   10794             : 
   10795             :         /* update all cpuctx for this PMU */
   10796           0 :         cpus_read_lock();
   10797           0 :         for_each_online_cpu(cpu) {
   10798           0 :                 struct perf_cpu_context *cpuctx;
   10799           0 :                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
   10800           0 :                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
   10801             : 
   10802           0 :                 cpu_function_call(cpu,
   10803             :                         (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
   10804             :         }
   10805           0 :         cpus_read_unlock();
   10806           0 :         mutex_unlock(&mux_interval_mutex);
   10807             : 
   10808           0 :         return count;
   10809             : }
   10810             : static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
   10811             : 
   10812             : static struct attribute *pmu_dev_attrs[] = {
   10813             :         &dev_attr_type.attr,
   10814             :         &dev_attr_perf_event_mux_interval_ms.attr,
   10815             :         NULL,
   10816             : };
   10817             : ATTRIBUTE_GROUPS(pmu_dev);
   10818             : 
   10819             : static int pmu_bus_running;
   10820             : static struct bus_type pmu_bus = {
   10821             :         .name           = "event_source",
   10822             :         .dev_groups     = pmu_dev_groups,
   10823             : };
   10824             : 
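The attributes above become visible through the "event_source" bus declared just above: each registered PMU shows up as /sys/bus/event_source/devices/<name>/ with its type and perf_event_mux_interval_ms files. A small userspace sketch (helper name illustrative) that feeds a new interval to the store handler:

#include <stdio.h>

static int set_mux_interval_ms(const char *pmu_name, int ms)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/bus/event_source/devices/%s/perf_event_mux_interval_ms",
		 pmu_name);
	f = fopen(path, "w");
	if (!f)
		return -1;
	/* Parsed by kstrtoint() in perf_event_mux_interval_ms_store() above. */
	fprintf(f, "%d\n", ms);
	return fclose(f);
}
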
   10825           0 : static void pmu_dev_release(struct device *dev)
   10826             : {
   10827           0 :         kfree(dev);
   10828           0 : }
   10829             : 
   10830           5 : static int pmu_dev_alloc(struct pmu *pmu)
   10831             : {
   10832           5 :         int ret = -ENOMEM;
   10833             : 
   10834           5 :         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
   10835           5 :         if (!pmu->dev)
   10836           0 :                 goto out;
   10837             : 
   10838           5 :         pmu->dev->groups = pmu->attr_groups;
   10839           5 :         device_initialize(pmu->dev);
   10840           5 :         ret = dev_set_name(pmu->dev, "%s", pmu->name);
   10841           5 :         if (ret)
   10842           0 :                 goto free_dev;
   10843             : 
   10844           5 :         dev_set_drvdata(pmu->dev, pmu);
   10845           5 :         pmu->dev->bus = &pmu_bus;
   10846           5 :         pmu->dev->release = pmu_dev_release;
   10847           5 :         ret = device_add(pmu->dev);
   10848           5 :         if (ret)
   10849           0 :                 goto free_dev;
   10850             : 
   10851             :         /* For PMUs with address filters, throw in an extra attribute: */
   10852           5 :         if (pmu->nr_addr_filters)
   10853           0 :                 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
   10854             : 
   10855           5 :         if (ret)
   10856           0 :                 goto del_dev;
   10857             : 
   10858           5 :         if (pmu->attr_update)
   10859           2 :                 ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
   10860             : 
   10861           5 :         if (ret)
   10862           0 :                 goto del_dev;
   10863             : 
   10864           5 : out:
   10865           5 :         return ret;
   10866             : 
   10867           0 : del_dev:
   10868           0 :         device_del(pmu->dev);
   10869             : 
   10870           0 : free_dev:
   10871           0 :         put_device(pmu->dev);
   10872           0 :         goto out;
   10873             : }
   10874             : 
   10875             : static struct lock_class_key cpuctx_mutex;
   10876             : static struct lock_class_key cpuctx_lock;
   10877             : 
   10878           7 : int perf_pmu_register(struct pmu *pmu, const char *name, int type)
   10879             : {
   10880           7 :         int cpu, ret, max = PERF_TYPE_MAX;
   10881             : 
   10882           7 :         mutex_lock(&pmus_lock);
   10883           7 :         ret = -ENOMEM;
   10884           7 :         pmu->pmu_disable_count = alloc_percpu(int);
   10885           7 :         if (!pmu->pmu_disable_count)
   10886           0 :                 goto unlock;
   10887             : 
   10888           7 :         pmu->type = -1;
   10889           7 :         if (!name)
   10890           2 :                 goto skip_type;
   10891           5 :         pmu->name = name;
   10892             : 
   10893           5 :         if (type != PERF_TYPE_SOFTWARE) {
   10894           4 :                 if (type >= 0)
   10895           3 :                         max = type;
   10896             : 
   10897           4 :                 ret = idr_alloc(&pmu_idr, pmu, max, 0, GFP_KERNEL);
   10898           4 :                 if (ret < 0)
   10899           0 :                         goto free_pdc;
   10900             : 
   10901           4 :                 WARN_ON(type >= 0 && ret != type);
   10902             : 
   10903             :                 type = ret;
   10904             :         }
   10905           5 :         pmu->type = type;
   10906             : 
   10907           5 :         if (pmu_bus_running) {
   10908           0 :                 ret = pmu_dev_alloc(pmu);
   10909           0 :                 if (ret)
   10910           0 :                         goto free_idr;
   10911             :         }
   10912             : 
   10913           5 : skip_type:
   10914           7 :         if (pmu->task_ctx_nr == perf_hw_context) {
   10915           1 :                 static int hw_context_taken = 0;
   10916             : 
   10917             :                 /*
   10918             :                  * Other than systems with heterogeneous CPUs, it never makes
   10919             :                  * sense for two PMUs to share perf_hw_context. PMUs which are
   10920             :                  * uncore must use perf_invalid_context.
   10921             :                  */
   10922           2 :                 if (WARN_ON_ONCE(hw_context_taken &&
   10923             :                     !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
   10924           0 :                         pmu->task_ctx_nr = perf_invalid_context;
   10925             : 
   10926           1 :                 hw_context_taken = 1;
   10927             :         }
   10928             : 
   10929           7 :         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
   10930           7 :         if (pmu->pmu_cpu_context)
   10931           5 :                 goto got_cpu_context;
   10932             : 
   10933           2 :         ret = -ENOMEM;
   10934           2 :         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
   10935           2 :         if (!pmu->pmu_cpu_context)
   10936           0 :                 goto free_dev;
   10937             : 
   10938          10 :         for_each_possible_cpu(cpu) {
   10939           8 :                 struct perf_cpu_context *cpuctx;
   10940             : 
   10941           8 :                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
   10942           8 :                 __perf_event_init_context(&cpuctx->ctx);
   10943           8 :                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
   10944           8 :                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
   10945           8 :                 cpuctx->ctx.pmu = pmu;
   10946           8 :                 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
   10947             : 
   10948           8 :                 __perf_mux_hrtimer_init(cpuctx, cpu);
   10949             : 
   10950           8 :                 cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
   10951           8 :                 cpuctx->heap = cpuctx->heap_default;
   10952             :         }
   10953             : 
   10954           2 : got_cpu_context:
   10955           7 :         if (!pmu->start_txn) {
   10956           6 :                 if (pmu->pmu_enable) {
   10957             :                         /*
   10958             :                          * If we have pmu_enable/pmu_disable calls, install
   10959             :                          * transaction stubs that use that to try and batch
   10960             :                          * hardware accesses.
   10961             :                          */
   10962           0 :                         pmu->start_txn  = perf_pmu_start_txn;
   10963           0 :                         pmu->commit_txn = perf_pmu_commit_txn;
   10964           0 :                         pmu->cancel_txn = perf_pmu_cancel_txn;
   10965             :                 } else {
   10966           6 :                         pmu->start_txn  = perf_pmu_nop_txn;
   10967           6 :                         pmu->commit_txn = perf_pmu_nop_int;
   10968           6 :                         pmu->cancel_txn = perf_pmu_nop_void;
   10969             :                 }
   10970             :         }
   10971             : 
   10972           7 :         if (!pmu->pmu_enable) {
   10973           6 :                 pmu->pmu_enable  = perf_pmu_nop_void;
   10974           6 :                 pmu->pmu_disable = perf_pmu_nop_void;
   10975             :         }
   10976             : 
   10977           7 :         if (!pmu->check_period)
   10978           6 :                 pmu->check_period = perf_event_nop_int;
   10979             : 
   10980           7 :         if (!pmu->event_idx)
   10981           6 :                 pmu->event_idx = perf_event_idx_default;
   10982             : 
   10983             :         /*
   10984             :          * Ensure the TYPE_SOFTWARE PMUs are at the head of the list,
   10985             :          * since these cannot be in the IDR. This way the linear search
    10986             :          * is fast when a valid software event is requested.
   10987             :          */
   10988           7 :         if (type == PERF_TYPE_SOFTWARE || !name)
   10989           3 :                 list_add_rcu(&pmu->entry, &pmus);
   10990             :         else
   10991           4 :                 list_add_tail_rcu(&pmu->entry, &pmus);
   10992             : 
   10993           7 :         atomic_set(&pmu->exclusive_cnt, 0);
   10994           7 :         ret = 0;
   10995           7 : unlock:
   10996           7 :         mutex_unlock(&pmus_lock);
   10997             : 
   10998           7 :         return ret;
   10999             : 
   11000           0 : free_dev:
   11001           0 :         device_del(pmu->dev);
   11002           0 :         put_device(pmu->dev);
   11003             : 
   11004           0 : free_idr:
   11005           0 :         if (pmu->type != PERF_TYPE_SOFTWARE)
   11006           0 :                 idr_remove(&pmu_idr, pmu->type);
   11007             : 
   11008           0 : free_pdc:
   11009           0 :         free_percpu(pmu->pmu_disable_count);
   11010           0 :         goto unlock;
   11011             : }
   11012             : EXPORT_SYMBOL_GPL(perf_pmu_register);
   11013             : 
   11014           0 : void perf_pmu_unregister(struct pmu *pmu)
   11015             : {
   11016           0 :         mutex_lock(&pmus_lock);
   11017           0 :         list_del_rcu(&pmu->entry);
   11018             : 
   11019             :         /*
   11020             :          * We dereference the pmu list under both SRCU and regular RCU, so
   11021             :          * synchronize against both of those.
   11022             :          */
   11023           0 :         synchronize_srcu(&pmus_srcu);
   11024           0 :         synchronize_rcu();
   11025             : 
   11026           0 :         free_percpu(pmu->pmu_disable_count);
   11027           0 :         if (pmu->type != PERF_TYPE_SOFTWARE)
   11028           0 :                 idr_remove(&pmu_idr, pmu->type);
   11029           0 :         if (pmu_bus_running) {
   11030           0 :                 if (pmu->nr_addr_filters)
   11031           0 :                         device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
   11032           0 :                 device_del(pmu->dev);
   11033           0 :                 put_device(pmu->dev);
   11034             :         }
   11035           0 :         free_pmu_context(pmu);
   11036           0 :         mutex_unlock(&pmus_lock);
   11037           0 : }
   11038             : EXPORT_SYMBOL_GPL(perf_pmu_unregister);
   11039             : 
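From a driver's point of view the contract of perf_pmu_register()/perf_pmu_unregister() above is small: provide event_init plus the add/del/start/stop/read callbacks and let the registration code fill in nop transaction handlers, the default event_idx and so on. A hypothetical, do-nothing module-side sketch (all demo_* names are illustrative, not an existing PMU):

#include <linux/module.h>
#include <linux/perf_event.h>

static int demo_event_init(struct perf_event *event)
{
	/* -ENOENT tells perf_init_event() to keep looking at other PMUs. */
	if (event->attr.type != event->pmu->type)
		return -ENOENT;
	return 0;
}

static int  demo_add(struct perf_event *event, int flags)   { return 0; }
static void demo_del(struct perf_event *event, int flags)   { }
static void demo_start(struct perf_event *event, int flags) { }
static void demo_stop(struct perf_event *event, int flags)  { }
static void demo_read(struct perf_event *event)             { }

static struct pmu demo_pmu = {
	.task_ctx_nr = perf_invalid_context,	/* uncore-style, no task context */
	.event_init  = demo_event_init,
	.add         = demo_add,
	.del         = demo_del,
	.start       = demo_start,
	.stop        = demo_stop,
	.read        = demo_read,
};

static int __init demo_init(void)
{
	/* type == -1: let the idr_alloc() branch above pick a dynamic type. */
	return perf_pmu_register(&demo_pmu, "demo_pmu", -1);
}

static void __exit demo_exit(void)
{
	perf_pmu_unregister(&demo_pmu);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
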
   11040           0 : static inline bool has_extended_regs(struct perf_event *event)
   11041             : {
   11042           0 :         return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
   11043           0 :                (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
   11044             : }
   11045             : 
   11046           0 : static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
   11047             : {
   11048           0 :         struct perf_event_context *ctx = NULL;
   11049           0 :         int ret;
   11050             : 
   11051           0 :         if (!try_module_get(pmu->module))
   11052             :                 return -ENODEV;
   11053             : 
   11054             :         /*
   11055             :          * A number of pmu->event_init() methods iterate the sibling_list to,
   11056             :          * for example, validate if the group fits on the PMU. Therefore,
   11057             :          * if this is a sibling event, acquire the ctx->mutex to protect
   11058             :          * the sibling_list.
   11059             :          */
   11060           0 :         if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
   11061             :                 /*
   11062             :                  * This ctx->mutex can nest when we're called through
   11063             :                  * inheritance. See the perf_event_ctx_lock_nested() comment.
   11064             :                  */
   11065           0 :                 ctx = perf_event_ctx_lock_nested(event->group_leader,
   11066             :                                                  SINGLE_DEPTH_NESTING);
   11067           0 :                 BUG_ON(!ctx);
   11068             :         }
   11069             : 
   11070           0 :         event->pmu = pmu;
   11071           0 :         ret = pmu->event_init(event);
   11072             : 
   11073           0 :         if (ctx)
   11074           0 :                 perf_event_ctx_unlock(event->group_leader, ctx);
   11075             : 
   11076           0 :         if (!ret) {
   11077           0 :                 if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
   11078           0 :                     has_extended_regs(event))
   11079           0 :                         ret = -EOPNOTSUPP;
   11080             : 
   11081           0 :                 if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
   11082           0 :                     event_has_any_exclude_flag(event))
   11083             :                         ret = -EINVAL;
   11084             : 
   11085           0 :                 if (ret && event->destroy)
   11086           0 :                         event->destroy(event);
   11087             :         }
   11088             : 
   11089           0 :         if (ret)
   11090           0 :                 module_put(pmu->module);
   11091             : 
   11092           0 :         return ret;
   11093             : }
   11094             : 
   11095           0 : static struct pmu *perf_init_event(struct perf_event *event)
   11096             : {
   11097           0 :         int idx, type, ret;
   11098           0 :         struct pmu *pmu;
   11099             : 
   11100           0 :         idx = srcu_read_lock(&pmus_srcu);
   11101             : 
   11102             :         /* Try parent's PMU first: */
   11103           0 :         if (event->parent && event->parent->pmu) {
   11104           0 :                 pmu = event->parent->pmu;
   11105           0 :                 ret = perf_try_init_event(pmu, event);
   11106           0 :                 if (!ret)
   11107           0 :                         goto unlock;
   11108             :         }
   11109             : 
   11110             :         /*
   11111             :          * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
   11112             :          * are often aliases for PERF_TYPE_RAW.
   11113             :          */
   11114           0 :         type = event->attr.type;
   11115           0 :         if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
   11116           0 :                 type = PERF_TYPE_RAW;
   11117             : 
   11118           0 : again:
   11119           0 :         rcu_read_lock();
   11120           0 :         pmu = idr_find(&pmu_idr, type);
   11121           0 :         rcu_read_unlock();
   11122           0 :         if (pmu) {
   11123           0 :                 ret = perf_try_init_event(pmu, event);
   11124           0 :                 if (ret == -ENOENT && event->attr.type != type) {
   11125           0 :                         type = event->attr.type;
   11126           0 :                         goto again;
   11127             :                 }
   11128             : 
   11129           0 :                 if (ret)
   11130           0 :                         pmu = ERR_PTR(ret);
   11131             : 
   11132           0 :                 goto unlock;
   11133             :         }
   11134             : 
   11135           0 :         list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
   11136           0 :                 ret = perf_try_init_event(pmu, event);
   11137           0 :                 if (!ret)
   11138           0 :                         goto unlock;
   11139             : 
   11140           0 :                 if (ret != -ENOENT) {
   11141           0 :                         pmu = ERR_PTR(ret);
   11142           0 :                         goto unlock;
   11143             :                 }
   11144             :         }
   11145           0 :         pmu = ERR_PTR(-ENOENT);
   11146           0 : unlock:
   11147           0 :         srcu_read_unlock(&pmus_srcu, idx);
   11148             : 
   11149           0 :         return pmu;
   11150             : }
   11151             : 
   11152           0 : static void attach_sb_event(struct perf_event *event)
   11153             : {
   11154           0 :         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
   11155             : 
   11156           0 :         raw_spin_lock(&pel->lock);
   11157           0 :         list_add_rcu(&event->sb_list, &pel->list);
   11158           0 :         raw_spin_unlock(&pel->lock);
   11159           0 : }
   11160             : 
   11161             : /*
   11162             :  * We keep a list of all !task (and therefore per-cpu) events
   11163             :  * that need to receive side-band records.
   11164             :  *
   11165             :  * This avoids having to scan all the various PMU per-cpu contexts
   11166             :  * looking for them.
   11167             :  */
   11168           0 : static void account_pmu_sb_event(struct perf_event *event)
   11169             : {
   11170           0 :         if (is_sb_event(event))
   11171           0 :                 attach_sb_event(event);
   11172           0 : }
   11173             : 
   11174           0 : static void account_event_cpu(struct perf_event *event, int cpu)
   11175             : {
   11176           0 :         if (event->parent)
   11177             :                 return;
   11178             : 
   11179           0 :         if (is_cgroup_event(event))
   11180           0 :                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
   11181             : }
   11182             : 
   11183             : /* Freq events need the tick to stay alive (see perf_event_task_tick). */
   11184             : static void account_freq_event_nohz(void)
   11185             : {
   11186             : #ifdef CONFIG_NO_HZ_FULL
   11187             :         /* Lock so we don't race with concurrent unaccount */
   11188             :         spin_lock(&nr_freq_lock);
   11189             :         if (atomic_inc_return(&nr_freq_events) == 1)
   11190             :                 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
   11191             :         spin_unlock(&nr_freq_lock);
   11192             : #endif
   11193             : }
   11194             : 
   11195           0 : static void account_freq_event(void)
   11196             : {
   11197           0 :         if (tick_nohz_full_enabled())
   11198             :                 account_freq_event_nohz();
   11199             :         else
   11200           0 :                 atomic_inc(&nr_freq_events);
   11201           0 : }
   11202             : 
   11203             : 
   11204           0 : static void account_event(struct perf_event *event)
   11205             : {
   11206           0 :         bool inc = false;
   11207             : 
   11208           0 :         if (event->parent)
   11209             :                 return;
   11210             : 
   11211           0 :         if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
   11212           0 :                 inc = true;
   11213           0 :         if (event->attr.mmap || event->attr.mmap_data)
   11214           0 :                 atomic_inc(&nr_mmap_events);
   11215           0 :         if (event->attr.build_id)
   11216           0 :                 atomic_inc(&nr_build_id_events);
   11217           0 :         if (event->attr.comm)
   11218           0 :                 atomic_inc(&nr_comm_events);
   11219           0 :         if (event->attr.namespaces)
   11220           0 :                 atomic_inc(&nr_namespaces_events);
   11221           0 :         if (event->attr.cgroup)
   11222           0 :                 atomic_inc(&nr_cgroup_events);
   11223           0 :         if (event->attr.task)
   11224           0 :                 atomic_inc(&nr_task_events);
   11225           0 :         if (event->attr.freq)
   11226           0 :                 account_freq_event();
   11227           0 :         if (event->attr.context_switch) {
   11228           0 :                 atomic_inc(&nr_switch_events);
   11229           0 :                 inc = true;
   11230             :         }
   11231           0 :         if (has_branch_stack(event))
   11232           0 :                 inc = true;
   11233           0 :         if (is_cgroup_event(event))
   11234             :                 inc = true;
   11235           0 :         if (event->attr.ksymbol)
   11236           0 :                 atomic_inc(&nr_ksymbol_events);
   11237           0 :         if (event->attr.bpf_event)
   11238           0 :                 atomic_inc(&nr_bpf_events);
   11239           0 :         if (event->attr.text_poke)
   11240           0 :                 atomic_inc(&nr_text_poke_events);
   11241             : 
   11242           0 :         if (inc) {
   11243             :                 /*
   11244             :                  * We need the mutex here because static_branch_enable()
   11245             :                  * must complete *before* the perf_sched_count increment
   11246             :                  * becomes visible.
   11247             :                  */
   11248           0 :                 if (atomic_inc_not_zero(&perf_sched_count))
   11249           0 :                         goto enabled;
   11250             : 
   11251           0 :                 mutex_lock(&perf_sched_mutex);
   11252           0 :                 if (!atomic_read(&perf_sched_count)) {
   11253           0 :                         static_branch_enable(&perf_sched_events);
   11254             :                         /*
    11255             :                          * Guarantee that all CPUs observe the key change and
   11256             :                          * call the perf scheduling hooks before proceeding to
   11257             :                          * install events that need them.
   11258             :                          */
   11259           0 :                         synchronize_rcu();
   11260             :                 }
   11261             :                 /*
    11262             :                  * Now that we have waited for the synchronize_rcu(), allow
    11263             :                  * further increments to bypass the mutex.
   11264             :                  */
   11265           0 :                 atomic_inc(&perf_sched_count);
   11266           0 :                 mutex_unlock(&perf_sched_mutex);
   11267             :         }
   11268           0 : enabled:
   11269             : 
   11270           0 :         account_event_cpu(event, event->cpu);
   11271             : 
   11272           0 :         account_pmu_sb_event(event);
   11273             : }
   11274             : 
   11275             : /*
   11276             :  * Allocate and initialize an event structure
   11277             :  */
   11278             : static struct perf_event *
   11279           0 : perf_event_alloc(struct perf_event_attr *attr, int cpu,
   11280             :                  struct task_struct *task,
   11281             :                  struct perf_event *group_leader,
   11282             :                  struct perf_event *parent_event,
   11283             :                  perf_overflow_handler_t overflow_handler,
   11284             :                  void *context, int cgroup_fd)
   11285             : {
   11286           0 :         struct pmu *pmu;
   11287           0 :         struct perf_event *event;
   11288           0 :         struct hw_perf_event *hwc;
   11289           0 :         long err = -EINVAL;
   11290             : 
   11291           0 :         if ((unsigned)cpu >= nr_cpu_ids) {
   11292           0 :                 if (!task || cpu != -1)
   11293           0 :                         return ERR_PTR(-EINVAL);
   11294             :         }
   11295             : 
   11296           0 :         event = kzalloc(sizeof(*event), GFP_KERNEL);
   11297           0 :         if (!event)
   11298           0 :                 return ERR_PTR(-ENOMEM);
   11299             : 
   11300             :         /*
   11301             :          * Single events are their own group leaders, with an
   11302             :          * empty sibling list:
   11303             :          */
   11304           0 :         if (!group_leader)
   11305           0 :                 group_leader = event;
   11306             : 
   11307           0 :         mutex_init(&event->child_mutex);
   11308           0 :         INIT_LIST_HEAD(&event->child_list);
   11309             : 
   11310           0 :         INIT_LIST_HEAD(&event->event_entry);
   11311           0 :         INIT_LIST_HEAD(&event->sibling_list);
   11312           0 :         INIT_LIST_HEAD(&event->active_list);
   11313           0 :         init_event_group(event);
   11314           0 :         INIT_LIST_HEAD(&event->rb_entry);
   11315           0 :         INIT_LIST_HEAD(&event->active_entry);
   11316           0 :         INIT_LIST_HEAD(&event->addr_filters.list);
   11317           0 :         INIT_HLIST_NODE(&event->hlist_entry);
   11318             : 
   11319             : 
   11320           0 :         init_waitqueue_head(&event->waitq);
   11321           0 :         event->pending_disable = -1;
   11322           0 :         init_irq_work(&event->pending, perf_pending_event);
   11323             : 
   11324           0 :         mutex_init(&event->mmap_mutex);
   11325           0 :         raw_spin_lock_init(&event->addr_filters.lock);
   11326             : 
   11327           0 :         atomic_long_set(&event->refcount, 1);
   11328           0 :         event->cpu           = cpu;
   11329           0 :         event->attr          = *attr;
   11330           0 :         event->group_leader  = group_leader;
   11331           0 :         event->pmu           = NULL;
   11332           0 :         event->oncpu         = -1;
   11333             : 
   11334           0 :         event->parent                = parent_event;
   11335             : 
   11336           0 :         event->ns            = get_pid_ns(task_active_pid_ns(current));
   11337           0 :         event->id            = atomic64_inc_return(&perf_event_id);
   11338             : 
   11339           0 :         event->state         = PERF_EVENT_STATE_INACTIVE;
   11340             : 
   11341           0 :         if (task) {
   11342           0 :                 event->attach_state = PERF_ATTACH_TASK;
   11343             :                 /*
   11344             :                  * XXX pmu::event_init needs to know what task to account to
   11345             :                  * and we cannot use the ctx information because we need the
   11346             :                  * pmu before we get a ctx.
   11347             :                  */
   11348           0 :                 event->hw.target = get_task_struct(task);
   11349             :         }
   11350             : 
   11351           0 :         event->clock = &local_clock;
   11352           0 :         if (parent_event)
   11353           0 :                 event->clock = parent_event->clock;
   11354             : 
   11355           0 :         if (!overflow_handler && parent_event) {
   11356           0 :                 overflow_handler = parent_event->overflow_handler;
   11357           0 :                 context = parent_event->overflow_handler_context;
   11358             : #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
   11359             :                 if (overflow_handler == bpf_overflow_handler) {
   11360             :                         struct bpf_prog *prog = parent_event->prog;
   11361             : 
   11362             :                         bpf_prog_inc(prog);
   11363             :                         event->prog = prog;
   11364             :                         event->orig_overflow_handler =
   11365             :                                 parent_event->orig_overflow_handler;
   11366             :                 }
   11367             : #endif
   11368             :         }
   11369             : 
   11370           0 :         if (overflow_handler) {
   11371           0 :                 event->overflow_handler      = overflow_handler;
   11372           0 :                 event->overflow_handler_context = context;
    11373           0 :         } else if (is_write_backward(event)) {
   11374           0 :                 event->overflow_handler = perf_event_output_backward;
   11375           0 :                 event->overflow_handler_context = NULL;
   11376             :         } else {
   11377           0 :                 event->overflow_handler = perf_event_output_forward;
   11378           0 :                 event->overflow_handler_context = NULL;
   11379             :         }
   11380             : 
   11381           0 :         perf_event__state_init(event);
   11382             : 
   11383           0 :         pmu = NULL;
   11384             : 
   11385           0 :         hwc = &event->hw;
   11386           0 :         hwc->sample_period = attr->sample_period;
   11387           0 :         if (attr->freq && attr->sample_freq)
   11388           0 :                 hwc->sample_period = 1;
   11389           0 :         hwc->last_period = hwc->sample_period;
   11390             : 
   11391           0 :         local64_set(&hwc->period_left, hwc->sample_period);
   11392             : 
   11393             :         /*
   11394             :          * We currently do not support PERF_SAMPLE_READ on inherited events.
   11395             :          * See perf_output_read().
   11396             :          */
   11397           0 :         if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
   11398           0 :                 goto err_ns;
   11399             : 
   11400           0 :         if (!has_branch_stack(event))
   11401           0 :                 event->attr.branch_sample_type = 0;
   11402             : 
   11403           0 :         pmu = perf_init_event(event);
   11404           0 :         if (IS_ERR(pmu)) {
   11405           0 :                 err = PTR_ERR(pmu);
   11406           0 :                 goto err_ns;
   11407             :         }
   11408             : 
   11409             :         /*
   11410             :          * Disallow uncore-cgroup events, they don't make sense as the cgroup will
   11411             :          * be different on other CPUs in the uncore mask.
   11412             :          */
   11413           0 :         if (pmu->task_ctx_nr == perf_invalid_context && cgroup_fd != -1) {
   11414           0 :                 err = -EINVAL;
   11415           0 :                 goto err_pmu;
   11416             :         }
   11417             : 
   11418           0 :         if (event->attr.aux_output &&
   11419           0 :             !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) {
   11420           0 :                 err = -EOPNOTSUPP;
   11421           0 :                 goto err_pmu;
   11422             :         }
   11423             : 
   11424           0 :         if (cgroup_fd != -1) {
   11425           0 :                 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
   11426           0 :                 if (err)
   11427           0 :                         goto err_pmu;
   11428             :         }
   11429             : 
   11430           0 :         err = exclusive_event_init(event);
   11431           0 :         if (err)
   11432           0 :                 goto err_pmu;
   11433             : 
   11434           0 :         if (has_addr_filter(event)) {
   11435           0 :                 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
   11436             :                                                     sizeof(struct perf_addr_filter_range),
   11437             :                                                     GFP_KERNEL);
   11438           0 :                 if (!event->addr_filter_ranges) {
   11439           0 :                         err = -ENOMEM;
   11440           0 :                         goto err_per_task;
   11441             :                 }
   11442             : 
   11443             :                 /*
   11444             :                  * Clone the parent's vma offsets: they are valid until exec()
   11445             :                  * even if the mm is not shared with the parent.
   11446             :                  */
   11447           0 :                 if (event->parent) {
   11448           0 :                         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
   11449             : 
   11450           0 :                         raw_spin_lock_irq(&ifh->lock);
   11451           0 :                         memcpy(event->addr_filter_ranges,
   11452           0 :                                event->parent->addr_filter_ranges,
   11453           0 :                                pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
   11454           0 :                         raw_spin_unlock_irq(&ifh->lock);
   11455             :                 }
   11456             : 
   11457             :                 /* force hw sync on the address filters */
   11458           0 :                 event->addr_filters_gen = 1;
   11459             :         }
   11460             : 
   11461           0 :         if (!event->parent) {
   11462           0 :                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
   11463           0 :                         err = get_callchain_buffers(attr->sample_max_stack);
   11464           0 :                         if (err)
   11465           0 :                                 goto err_addr_filters;
   11466             :                 }
   11467             :         }
   11468             : 
   11469           0 :         err = security_perf_event_alloc(event);
   11470           0 :         if (err)
   11471           0 :                 goto err_callchain_buffer;
   11472             : 
   11473             :         /* symmetric to unaccount_event() in _free_event() */
   11474           0 :         account_event(event);
   11475             : 
   11476           0 :         return event;
   11477             : 
   11478           0 : err_callchain_buffer:
   11479           0 :         if (!event->parent) {
   11480           0 :                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
   11481           0 :                         put_callchain_buffers();
   11482             :         }
   11483           0 : err_addr_filters:
   11484           0 :         kfree(event->addr_filter_ranges);
   11485             : 
   11486           0 : err_per_task:
   11487           0 :         exclusive_event_destroy(event);
   11488             : 
   11489           0 : err_pmu:
   11490           0 :         if (is_cgroup_event(event))
   11491           0 :                 perf_detach_cgroup(event);
   11492           0 :         if (event->destroy)
   11493           0 :                 event->destroy(event);
   11494           0 :         module_put(pmu->module);
   11495           0 : err_ns:
   11496           0 :         if (event->ns)
   11497           0 :                 put_pid_ns(event->ns);
   11498           0 :         if (event->hw.target)
   11499           0 :                 put_task_struct(event->hw.target);
   11500           0 :         kfree(event);
   11501             : 
   11502           0 :         return ERR_PTR(err);
   11503             : }
   11504             : 
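Outside the perf_event_open() syscall path, the usual way other kernel code reaches perf_event_alloc() is perf_event_create_kernel_counter(), which wraps the allocation together with context lookup and installation. A hedged in-kernel sketch, with the overflow handler and the chosen period purely illustrative:

#include <linux/perf_event.h>

static void demo_overflow(struct perf_event *event,
			  struct perf_sample_data *data,
			  struct pt_regs *regs)
{
	/* Called from __perf_event_overflow() each time the period expires. */
}

static struct perf_event *demo_cpu_clock_counter(int cpu)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= PERF_COUNT_SW_CPU_CLOCK,
		.size		= sizeof(attr),
		.sample_period	= 1000000,	/* ns for the sw clock, ~1ms */
	};

	/* NULL task: a per-CPU counter bound to 'cpu'. */
	return perf_event_create_kernel_counter(&attr, cpu, NULL,
						demo_overflow, NULL);
}
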
   11505           0 : static int perf_copy_attr(struct perf_event_attr __user *uattr,
   11506             :                           struct perf_event_attr *attr)
   11507             : {
   11508           0 :         u32 size;
   11509           0 :         int ret;
   11510             : 
   11511             :         /* Zero the full structure, so that a short copy leaves the rest zeroed. */
   11512           0 :         memset(attr, 0, sizeof(*attr));
   11513             : 
   11514           0 :         ret = get_user(size, &uattr->size);
   11515           0 :         if (ret)
   11516             :                 return ret;
   11517             : 
   11518             :         /* ABI compatibility quirk: */
   11519           0 :         if (!size)
   11520           0 :                 size = PERF_ATTR_SIZE_VER0;
   11521           0 :         if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
   11522           0 :                 goto err_size;
   11523             : 
   11524           0 :         ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
   11525           0 :         if (ret) {
   11526           0 :                 if (ret == -E2BIG)
   11527           0 :                         goto err_size;
   11528             :                 return ret;
   11529             :         }
   11530             : 
   11531           0 :         attr->size = size;
   11532             : 
   11533           0 :         if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
   11534             :                 return -EINVAL;
   11535             : 
   11536           0 :         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
   11537             :                 return -EINVAL;
   11538             : 
   11539           0 :         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
   11540             :                 return -EINVAL;
   11541             : 
   11542           0 :         if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
   11543           0 :                 u64 mask = attr->branch_sample_type;
   11544             : 
   11545             :                 /* only using defined bits */
   11546           0 :                 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
   11547             :                         return -EINVAL;
   11548             : 
   11549             :                 /* at least one branch bit must be set */
   11550           0 :                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
   11551             :                         return -EINVAL;
   11552             : 
   11553             :                 /* propagate priv level, when not set for branch */
   11554           0 :                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
   11555             : 
   11556             :                         /* exclude_kernel checked on syscall entry */
   11557           0 :                         if (!attr->exclude_kernel)
   11558           0 :                                 mask |= PERF_SAMPLE_BRANCH_KERNEL;
   11559             : 
   11560           0 :                         if (!attr->exclude_user)
   11561           0 :                                 mask |= PERF_SAMPLE_BRANCH_USER;
   11562             : 
   11563           0 :                         if (!attr->exclude_hv)
   11564           0 :                                 mask |= PERF_SAMPLE_BRANCH_HV;
   11565             :                         /*
   11566             :                          * adjust user setting (for HW filter setup)
   11567             :                          */
   11568           0 :                         attr->branch_sample_type = mask;
   11569             :                 }
   11570             :                 /* privileged levels capture (kernel, hv): check permissions */
   11571           0 :                 if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
   11572           0 :                         ret = perf_allow_kernel(attr);
   11573           0 :                         if (ret)
   11574             :                                 return ret;
   11575             :                 }
   11576             :         }
   11577             : 
   11578           0 :         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
   11579           0 :                 ret = perf_reg_validate(attr->sample_regs_user);
   11580           0 :                 if (ret)
   11581             :                         return ret;
   11582             :         }
   11583             : 
   11584           0 :         if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
   11585           0 :                 if (!arch_perf_have_user_stack_dump())
   11586             :                         return -ENOSYS;
   11587             : 
   11588             :                 /*
   11589             :                  * We have __u32 type for the size, but so far
   11590             :                  * we can only use __u16 as maximum due to the
   11591             :                  * __u16 sample size limit.
   11592             :                  */
   11593           0 :                 if (attr->sample_stack_user >= USHRT_MAX)
   11594             :                         return -EINVAL;
   11595           0 :                 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
   11596             :                         return -EINVAL;
   11597             :         }
   11598             : 
   11599           0 :         if (!attr->sample_max_stack)
   11600           0 :                 attr->sample_max_stack = sysctl_perf_event_max_stack;
   11601             : 
   11602           0 :         if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
   11603           0 :                 ret = perf_reg_validate(attr->sample_regs_intr);
   11604             : 
   11605             : #ifndef CONFIG_CGROUP_PERF
   11606           0 :         if (attr->sample_type & PERF_SAMPLE_CGROUP)
   11607             :                 return -EINVAL;
   11608             : #endif
   11609           0 :         if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
   11610             :             (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
   11611           0 :                 return -EINVAL;
   11612             : 
   11613           0 : out:
   11614             :         return ret;
   11615             : 
   11616           0 : err_size:
   11617           0 :         put_user(sizeof(*attr), &uattr->size);
   11618           0 :         ret = -E2BIG;
   11619           0 :         goto out;
   11620             : }
   11621             : 
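perf_copy_attr() implements the sized-struct ABI for perf_event_attr: user space declares the size it was compiled against, older/smaller layouts are accepted, larger ones only if the extra bytes are zero, and on -E2BIG the kernel writes the size it supports back into uattr->size. A user-space sketch of that calling convention (open_task_clock() is an illustrative helper name):

#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Open a software counter the way perf_copy_attr() expects: zero the
 * structure, announce its size, and let the kernel accept or reject the
 * layout. On -E2BIG the kernel has written the attr size it supports back
 * into attr.size (see the err_size label above). */
static int open_task_clock(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.disabled = 1;

	return syscall(__NR_perf_event_open, &attr, pid, -1 /* any cpu */,
		       -1 /* no group fd */, PERF_FLAG_FD_CLOEXEC);
}
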
   11622             : static int
   11623           0 : perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
   11624             : {
   11625           0 :         struct perf_buffer *rb = NULL;
   11626           0 :         int ret = -EINVAL;
   11627             : 
   11628           0 :         if (!output_event)
   11629           0 :                 goto set;
   11630             : 
   11631             :         /* don't allow circular references */
   11632           0 :         if (event == output_event)
   11633           0 :                 goto out;
   11634             : 
   11635             :         /*
   11636             :          * Don't allow cross-cpu buffers
   11637             :          */
   11638           0 :         if (output_event->cpu != event->cpu)
   11639           0 :                 goto out;
   11640             : 
   11641             :         /*
   11642             :          * If it's not a per-cpu rb, it must be the same task.
   11643             :          */
   11644           0 :         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
   11645           0 :                 goto out;
   11646             : 
   11647             :         /*
   11648             :          * Mixing clocks in the same buffer is trouble you don't need.
   11649             :          */
   11650           0 :         if (output_event->clock != event->clock)
   11651           0 :                 goto out;
   11652             : 
   11653             :         /*
   11654             :          * Either writing ring buffer from beginning or from end.
   11655             :          * Mixing is not allowed.
   11656             :          */
   11657           0 :         if (is_write_backward(output_event) != is_write_backward(event))
   11658           0 :                 goto out;
   11659             : 
   11660             :         /*
   11661             :          * If both events generate aux data, they must be on the same PMU
   11662             :          */
   11663           0 :         if (has_aux(event) && has_aux(output_event) &&
   11664             :             event->pmu != output_event->pmu)
   11665           0 :                 goto out;
   11666             : 
   11667           0 : set:
   11668           0 :         mutex_lock(&event->mmap_mutex);
   11669             :         /* Can't redirect output if we've got an active mmap() */
   11670           0 :         if (atomic_read(&event->mmap_count))
   11671           0 :                 goto unlock;
   11672             : 
   11673           0 :         if (output_event) {
   11674             :                 /* get the rb we want to redirect to */
   11675           0 :                 rb = ring_buffer_get(output_event);
   11676           0 :                 if (!rb)
   11677           0 :                         goto unlock;
   11678             :         }
   11679             : 
   11680           0 :         ring_buffer_attach(event, rb);
   11681             : 
   11682           0 :         ret = 0;
   11683           0 : unlock:
   11684           0 :         mutex_unlock(&event->mmap_mutex);
   11685             : 
   11686           0 : out:
   11687           0 :         return ret;
   11688             : }
   11689             : 
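perf_event_set_output() backs both the PERF_FLAG_FD_OUTPUT open flag and the PERF_EVENT_IOC_SET_OUTPUT ioctl: it redirects an event's output into another event's ring buffer, subject to the same-cpu, same-task, same-clock and same-direction checks above, and only while the redirected event has no active mmap(). A hedged user-space sketch, assuming two already-opened, compatible event fds (redirect_output() is an illustrative helper name):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

/* Route samples from 'src_fd' into the ring buffer that will be (or has
 * been) mmap()ed for 'target_fd'; src_fd itself must not be mmap()ed. */
static int redirect_output(int src_fd, int target_fd)
{
	if (ioctl(src_fd, PERF_EVENT_IOC_SET_OUTPUT, target_fd) < 0) {
		perror("PERF_EVENT_IOC_SET_OUTPUT");
		return -1;
	}
	return 0;
}
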
   11690           0 : static void mutex_lock_double(struct mutex *a, struct mutex *b)
   11691             : {
   11692           0 :         if (b < a)
   11693           0 :                 swap(a, b);
   11694             : 
   11695           0 :         mutex_lock(a);
   11696           0 :         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
   11697           0 : }
   11698             : 
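mutex_lock_double() prevents ABBA deadlocks by always acquiring the two mutexes in a fixed order (here: by address), regardless of the order the caller passed them in; the nested annotation only tells lockdep that the second acquisition is intentional. The same idiom as a generic user-space sketch (lock_pair() is illustrative, not kernel API):

#include <pthread.h>

/* Take two mutexes in a globally consistent order (lowest address first),
 * so two threads locking the same pair with swapped arguments cannot
 * deadlock against each other. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (b < a) {
		pthread_mutex_t *tmp = a;

		a = b;
		b = tmp;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}
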
   11699           0 : static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
   11700             : {
   11701           0 :         bool nmi_safe = false;
   11702             : 
   11703           0 :         switch (clk_id) {
   11704           0 :         case CLOCK_MONOTONIC:
   11705           0 :                 event->clock = &ktime_get_mono_fast_ns;
   11706           0 :                 nmi_safe = true;
   11707           0 :                 break;
   11708             : 
   11709           0 :         case CLOCK_MONOTONIC_RAW:
   11710           0 :                 event->clock = &ktime_get_raw_fast_ns;
   11711           0 :                 nmi_safe = true;
   11712           0 :                 break;
   11713             : 
   11714           0 :         case CLOCK_REALTIME:
   11715           0 :                 event->clock = &ktime_get_real_ns;
   11716           0 :                 break;
   11717             : 
   11718           0 :         case CLOCK_BOOTTIME:
   11719           0 :                 event->clock = &ktime_get_boottime_ns;
   11720           0 :                 break;
   11721             : 
   11722           0 :         case CLOCK_TAI:
   11723           0 :                 event->clock = &ktime_get_clocktai_ns;
   11724           0 :                 break;
   11725             : 
   11726             :         default:
   11727             :                 return -EINVAL;
   11728             :         }
   11729             : 
   11730           0 :         if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
   11731           0 :                 return -EINVAL;
   11732             : 
   11733             :         return 0;
   11734             : }
   11735             : 
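perf_event_set_clock() is reached when user space sets attr.use_clockid and attr.clockid at open time; clocks that are not NMI-safe (CLOCK_REALTIME, CLOCK_BOOTTIME, CLOCK_TAI) are only allowed on PMUs that advertise PERF_PMU_CAP_NO_NMI. A user-space sketch selecting CLOCK_MONOTONIC_RAW so sample timestamps can be correlated with clock_gettime() (open_with_clockid() is an illustrative helper name):

#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

/* Request MONOTONIC_RAW timestamps in PERF_SAMPLE_TIME instead of the
 * default perf clock; handled by perf_event_set_clock() above. */
static int open_with_clockid(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;
	attr.sample_type = PERF_SAMPLE_TIME;
	attr.use_clockid = 1;
	attr.clockid = CLOCK_MONOTONIC_RAW;

	return syscall(__NR_perf_event_open, &attr, 0 /* this task */,
		       -1 /* any cpu */, -1, PERF_FLAG_FD_CLOEXEC);
}
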
   11736             : /*
   11737             :  * Variation on perf_event_ctx_lock_nested(), except we take two context
   11738             :  * mutexes.
   11739             :  */
   11740             : static struct perf_event_context *
   11741           0 : __perf_event_ctx_lock_double(struct perf_event *group_leader,
   11742             :                              struct perf_event_context *ctx)
   11743             : {
   11744           0 :         struct perf_event_context *gctx;
   11745             : 
   11746             : again:
   11747           0 :         rcu_read_lock();
   11748           0 :         gctx = READ_ONCE(group_leader->ctx);
   11749           0 :         if (!refcount_inc_not_zero(&gctx->refcount)) {
   11750           0 :                 rcu_read_unlock();
   11751           0 :                 goto again;
   11752             :         }
   11753           0 :         rcu_read_unlock();
   11754             : 
   11755           0 :         mutex_lock_double(&gctx->mutex, &ctx->mutex);
   11756             : 
   11757           0 :         if (group_leader->ctx != gctx) {
   11758           0 :                 mutex_unlock(&ctx->mutex);
   11759           0 :                 mutex_unlock(&gctx->mutex);
   11760           0 :                 put_ctx(gctx);
   11761           0 :                 goto again;
   11762             :         }
   11763             : 
   11764           0 :         return gctx;
   11765             : }
   11766             : 
   11767             : /**
   11768             :  * sys_perf_event_open - open a performance event, associate it to a task/cpu
   11769             :  *
   11770             :  * @attr_uptr:  event_id type attributes for monitoring/sampling
   11771             :  * @pid:                target pid
   11772             :  * @cpu:                target cpu
   11773             :  * @group_fd:           group leader event fd
   11774             :  */
   11775           0 : SYSCALL_DEFINE5(perf_event_open,
   11776             :                 struct perf_event_attr __user *, attr_uptr,
   11777             :                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
   11778             : {
   11779           0 :         struct perf_event *group_leader = NULL, *output_event = NULL;
   11780           0 :         struct perf_event *event, *sibling;
   11781           0 :         struct perf_event_attr attr;
   11782           0 :         struct perf_event_context *ctx, *gctx;
   11783           0 :         struct file *event_file = NULL;
   11784           0 :         struct fd group = {NULL, 0};
   11785           0 :         struct task_struct *task = NULL;
   11786           0 :         struct pmu *pmu;
   11787           0 :         int event_fd;
   11788           0 :         int move_group = 0;
   11789           0 :         int err;
   11790           0 :         int f_flags = O_RDWR;
   11791           0 :         int cgroup_fd = -1;
   11792             : 
   11793             :         /* for future expandability... */
   11794           0 :         if (flags & ~PERF_FLAG_ALL)
   11795             :                 return -EINVAL;
   11796             : 
   11797             :         /* Do we allow access to perf_event_open(2)? */
   11798           0 :         err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
   11799           0 :         if (err)
   11800           0 :                 return err;
   11801             : 
   11802           0 :         err = perf_copy_attr(attr_uptr, &attr);
   11803           0 :         if (err)
   11804           0 :                 return err;
   11805             : 
   11806           0 :         if (!attr.exclude_kernel) {
   11807           0 :                 err = perf_allow_kernel(&attr);
   11808           0 :                 if (err)
   11809           0 :                         return err;
   11810             :         }
   11811             : 
   11812           0 :         if (attr.namespaces) {
   11813           0 :                 if (!perfmon_capable())
   11814             :                         return -EACCES;
   11815             :         }
   11816             : 
   11817           0 :         if (attr.freq) {
   11818           0 :                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
   11819             :                         return -EINVAL;
   11820             :         } else {
   11821           0 :                 if (attr.sample_period & (1ULL << 63))
   11822             :                         return -EINVAL;
   11823             :         }
   11824             : 
   11825             :         /* Only privileged users can get physical addresses */
   11826           0 :         if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
   11827           0 :                 err = perf_allow_kernel(&attr);
   11828           0 :                 if (err)
   11829           0 :                         return err;
   11830             :         }
   11831             : 
   11832           0 :         err = security_locked_down(LOCKDOWN_PERF);
   11833           0 :         if (err && (attr.sample_type & PERF_SAMPLE_REGS_INTR))
   11834             :                 /* REGS_INTR can leak data, lockdown must prevent this */
   11835           0 :                 return err;
   11836             : 
   11837           0 :         err = 0;
   11838             : 
   11839             :         /*
   11840             :          * In cgroup mode, the pid argument is used to pass the fd
   11841             :          * opened to the cgroup directory in cgroupfs. The cpu argument
   11842             :          * designates the cpu on which to monitor threads from that
   11843             :          * cgroup.
   11844             :          */
   11845           0 :         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
   11846             :                 return -EINVAL;
   11847             : 
   11848           0 :         if (flags & PERF_FLAG_FD_CLOEXEC)
   11849           0 :                 f_flags |= O_CLOEXEC;
   11850             : 
   11851           0 :         event_fd = get_unused_fd_flags(f_flags);
   11852           0 :         if (event_fd < 0)
   11853           0 :                 return event_fd;
   11854             : 
   11855           0 :         if (group_fd != -1) {
   11856           0 :                 err = perf_fget_light(group_fd, &group);
   11857           0 :                 if (err)
   11858           0 :                         goto err_fd;
   11859           0 :                 group_leader = group.file->private_data;
   11860           0 :                 if (flags & PERF_FLAG_FD_OUTPUT)
   11861           0 :                         output_event = group_leader;
   11862           0 :                 if (flags & PERF_FLAG_FD_NO_GROUP)
   11863           0 :                         group_leader = NULL;
   11864             :         }
   11865             : 
   11866           0 :         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
   11867           0 :                 task = find_lively_task_by_vpid(pid);
   11868           0 :                 if (IS_ERR(task)) {
   11869           0 :                         err = PTR_ERR(task);
   11870           0 :                         goto err_group_fd;
   11871             :                 }
   11872             :         }
   11873             : 
   11874           0 :         if (task && group_leader &&
   11875           0 :             group_leader->attr.inherit != attr.inherit) {
   11876           0 :                 err = -EINVAL;
   11877           0 :                 goto err_task;
   11878             :         }
   11879             : 
   11880           0 :         if (flags & PERF_FLAG_PID_CGROUP)
   11881           0 :                 cgroup_fd = pid;
   11882             : 
   11883           0 :         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
   11884             :                                  NULL, NULL, cgroup_fd);
   11885           0 :         if (IS_ERR(event)) {
   11886           0 :                 err = PTR_ERR(event);
   11887           0 :                 goto err_task;
   11888             :         }
   11889             : 
   11890           0 :         if (is_sampling_event(event)) {
   11891           0 :                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
   11892           0 :                         err = -EOPNOTSUPP;
   11893           0 :                         goto err_alloc;
   11894             :                 }
   11895             :         }
   11896             : 
   11897             :         /*
   11898             :          * Special case software events and allow them to be part of
   11899             :          * any hardware group.
   11900             :          */
   11901           0 :         pmu = event->pmu;
   11902             : 
   11903           0 :         if (attr.use_clockid) {
   11904           0 :                 err = perf_event_set_clock(event, attr.clockid);
   11905           0 :                 if (err)
   11906           0 :                         goto err_alloc;
   11907             :         }
   11908             : 
   11909           0 :         if (pmu->task_ctx_nr == perf_sw_context)
   11910           0 :                 event->event_caps |= PERF_EV_CAP_SOFTWARE;
   11911             : 
   11912           0 :         if (group_leader) {
   11913           0 :                 if (is_software_event(event) &&
   11914           0 :                     !in_software_context(group_leader)) {
   11915             :                         /*
   11916             :                          * If the event is a sw event, but the group_leader
   11917             :                          * is on hw context.
   11918             :                          *
   11919             :                          * Allow the addition of software events to hw
   11920             :                          * groups, this is safe because software events
   11921             :                          * never fail to schedule.
   11922             :                          */
   11923             :                         pmu = group_leader->ctx->pmu;
   11924           0 :                 } else if (!is_software_event(event) &&
   11925           0 :                            is_software_event(group_leader) &&
   11926           0 :                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
   11927             :                         /*
   11928             :                          * In case the group is a pure software group, and we
   11929             :                          * try to add a hardware event, move the whole group to
   11930             :                          * the hardware context.
   11931             :                          */
   11932             :                         move_group = 1;
   11933             :                 }
   11934             :         }
   11935             : 
   11936             :         /*
   11937             :          * Get the target context (task or percpu):
   11938             :          */
   11939           0 :         ctx = find_get_context(pmu, task, event);
   11940           0 :         if (IS_ERR(ctx)) {
   11941           0 :                 err = PTR_ERR(ctx);
   11942           0 :                 goto err_alloc;
   11943             :         }
   11944             : 
   11945             :         /*
   11946             :          * Look up the group leader (we will attach this event to it):
   11947             :          */
   11948           0 :         if (group_leader) {
   11949           0 :                 err = -EINVAL;
   11950             : 
   11951             :                 /*
   11952             :                  * Do not allow a recursive hierarchy (this new sibling
   11953             :                  * becoming part of another group-sibling):
   11954             :                  */
   11955           0 :                 if (group_leader->group_leader != group_leader)
   11956           0 :                         goto err_context;
   11957             : 
   11958             :                 /* All events in a group should have the same clock */
   11959           0 :                 if (group_leader->clock != event->clock)
   11960           0 :                         goto err_context;
   11961             : 
   11962             :                 /*
   11963             :                  * Make sure we're both events for the same CPU;
   11964             :                  * grouping events for different CPUs is broken, since
   11965             :                  * you can never concurrently schedule them anyhow.
   11966             :                  */
   11967           0 :                 if (group_leader->cpu != event->cpu)
   11968           0 :                         goto err_context;
   11969             : 
   11970             :                 /*
   11971             :                  * Make sure we're both on the same task, or both
   11972             :                  * per-CPU events.
   11973             :                  */
   11974           0 :                 if (group_leader->ctx->task != ctx->task)
   11975           0 :                         goto err_context;
   11976             : 
   11977             :                 /*
   11978             :                  * Do not allow attaching to a group in a different task
   11979             :                  * or CPU context. If we're moving SW events, we'll fix
   11980             :                  * this up later, so allow that.
   11981             :                  */
   11982           0 :                 if (!move_group && group_leader->ctx != ctx)
   11983           0 :                         goto err_context;
   11984             : 
   11985             :                 /*
   11986             :                  * Only a group leader can be exclusive or pinned
   11987             :                  */
   11988           0 :                 if (attr.exclusive || attr.pinned)
   11989           0 :                         goto err_context;
   11990             :         }
   11991             : 
   11992           0 :         if (output_event) {
   11993           0 :                 err = perf_event_set_output(event, output_event);
   11994           0 :                 if (err)
   11995           0 :                         goto err_context;
   11996             :         }
   11997             : 
   11998           0 :         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
   11999             :                                         f_flags);
   12000           0 :         if (IS_ERR(event_file)) {
   12001           0 :                 err = PTR_ERR(event_file);
   12002           0 :                 event_file = NULL;
   12003           0 :                 goto err_context;
   12004             :         }
   12005             : 
   12006           0 :         if (task) {
   12007           0 :                 err = down_read_interruptible(&task->signal->exec_update_lock);
   12008           0 :                 if (err)
   12009           0 :                         goto err_file;
   12010             : 
   12011             :                 /*
   12012             :                  * Preserve ptrace permission check for backwards compatibility.
   12013             :                  *
   12014             :                  * We must hold exec_update_lock across this and any potential
   12015             :                  * perf_install_in_context() call for this new event to
   12016             :                  * serialize against exec() altering our credentials (and the
   12017             :                  * perf_event_exit_task() that could imply).
   12018             :                  */
   12019           0 :                 err = -EACCES;
   12020           0 :                 if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
   12021           0 :                         goto err_cred;
   12022             :         }
   12023             : 
   12024           0 :         if (move_group) {
   12025           0 :                 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
   12026             : 
   12027           0 :                 if (gctx->task == TASK_TOMBSTONE) {
   12028           0 :                         err = -ESRCH;
   12029           0 :                         goto err_locked;
   12030             :                 }
   12031             : 
   12032             :                 /*
   12033             :                  * Check if we raced against another sys_perf_event_open() call
   12034             :                  * moving the software group underneath us.
   12035             :                  */
   12036           0 :                 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
   12037             :                         /*
   12038             :                          * If someone moved the group out from under us, check
   12039             :                          * if this new event wound up on the same ctx, if so
   12040             :                          * if this new event wound up on the same ctx; if so,
   12041             :                          * it's the regular !move_group case, otherwise fail.
   12042           0 :                         if (gctx != ctx) {
   12043           0 :                                 err = -EINVAL;
   12044           0 :                                 goto err_locked;
   12045             :                         } else {
   12046           0 :                                 perf_event_ctx_unlock(group_leader, gctx);
   12047           0 :                                 move_group = 0;
   12048             :                         }
   12049             :                 }
   12050             : 
   12051             :                 /*
   12052             :                  * Failure to create exclusive events returns -EBUSY.
   12053             :                  */
   12054           0 :                 err = -EBUSY;
   12055           0 :                 if (!exclusive_event_installable(group_leader, ctx))
   12056           0 :                         goto err_locked;
   12057             : 
   12058           0 :                 for_each_sibling_event(sibling, group_leader) {
   12059           0 :                         if (!exclusive_event_installable(sibling, ctx))
   12060           0 :                                 goto err_locked;
   12061             :                 }
   12062             :         } else {
   12063           0 :                 mutex_lock(&ctx->mutex);
   12064             :         }
   12065             : 
   12066           0 :         if (ctx->task == TASK_TOMBSTONE) {
   12067           0 :                 err = -ESRCH;
   12068           0 :                 goto err_locked;
   12069             :         }
   12070             : 
   12071           0 :         if (!perf_event_validate_size(event)) {
   12072           0 :                 err = -E2BIG;
   12073           0 :                 goto err_locked;
   12074             :         }
   12075             : 
   12076           0 :         if (!task) {
   12077             :                 /*
   12078             :                  * Check if the @cpu we're creating an event for is online.
   12079             :                  *
   12080             :                  * We use the perf_cpu_context::ctx::mutex to serialize against
   12081             :                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
   12082             :                  */
   12083           0 :                 struct perf_cpu_context *cpuctx =
   12084           0 :                         container_of(ctx, struct perf_cpu_context, ctx);
   12085             : 
   12086           0 :                 if (!cpuctx->online) {
   12087           0 :                         err = -ENODEV;
   12088           0 :                         goto err_locked;
   12089             :                 }
   12090             :         }
   12091             : 
   12092           0 :         if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
   12093           0 :                 err = -EINVAL;
   12094           0 :                 goto err_locked;
   12095             :         }
   12096             : 
   12097             :         /*
   12098             :          * Must be under the same ctx::mutex as perf_install_in_context(),
   12099             :          * because we need to serialize with concurrent event creation.
   12100             :          */
   12101           0 :         if (!exclusive_event_installable(event, ctx)) {
   12102           0 :                 err = -EBUSY;
   12103           0 :                 goto err_locked;
   12104             :         }
   12105             : 
   12106           0 :         WARN_ON_ONCE(ctx->parent_ctx);
   12107             : 
   12108             :         /*
   12109             :          * This is the point of no return; we cannot fail hereafter. This is
   12110             :          * where we start modifying current state.
   12111             :          */
   12112             : 
   12113           0 :         if (move_group) {
   12114             :                 /*
   12115             :                  * See perf_event_ctx_lock() for comments on the details
   12116             :                  * of swizzling perf_event::ctx.
   12117             :                  */
   12118           0 :                 perf_remove_from_context(group_leader, 0);
   12119           0 :                 put_ctx(gctx);
   12120             : 
   12121           0 :                 for_each_sibling_event(sibling, group_leader) {
   12122           0 :                         perf_remove_from_context(sibling, 0);
   12123           0 :                         put_ctx(gctx);
   12124             :                 }
   12125             : 
   12126             :                 /*
   12127             :                  * Wait for everybody to stop referencing the events through
   12128             :                  * the old lists, before installing them on the new lists.
   12129             :                  */
   12130           0 :                 synchronize_rcu();
   12131             : 
   12132             :                 /*
   12133             :                  * Install the group siblings before the group leader.
   12134             :                  *
   12135             :                  * Because a group leader will try and install the entire group
   12136             :                  * (through the sibling list, which is still intact), we can
   12137             :                  * end up with siblings installed in the wrong context.
   12138             :                  *
   12139             :                  * By installing siblings first we NO-OP because they're not
   12140             :                  * reachable through the group lists.
   12141             :                  */
   12142           0 :                 for_each_sibling_event(sibling, group_leader) {
   12143           0 :                         perf_event__state_init(sibling);
   12144           0 :                         perf_install_in_context(ctx, sibling, sibling->cpu);
   12145           0 :                         get_ctx(ctx);
   12146             :                 }
   12147             : 
   12148             :                 /*
   12149             :                  * Removing from the context ends up with disabled
   12150             :                  * Removing the event from its old context leaves it
   12151             :                  * disabled. What we want here is an event in its initial
   12152             :                  * startup state, ready to be added into the new context.
   12153           0 :                 perf_event__state_init(group_leader);
   12154           0 :                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
   12155           0 :                 get_ctx(ctx);
   12156             :         }
   12157             : 
   12158             :         /*
   12159             :          * Precalculate sample_data sizes; do this while holding ctx::mutex so
   12160             :          * that we're serialized against further additions, and before
   12161             :          * perf_install_in_context(), which is the point the event becomes
   12162             :          * active and can use these values.
   12163             :          */
   12164           0 :         perf_event__header_size(event);
   12165           0 :         perf_event__id_header_size(event);
   12166             : 
   12167           0 :         event->owner = current;
   12168             : 
   12169           0 :         perf_install_in_context(ctx, event, event->cpu);
   12170           0 :         perf_unpin_context(ctx);
   12171             : 
   12172           0 :         if (move_group)
   12173           0 :                 perf_event_ctx_unlock(group_leader, gctx);
   12174           0 :         mutex_unlock(&ctx->mutex);
   12175             : 
   12176           0 :         if (task) {
   12177           0 :                 up_read(&task->signal->exec_update_lock);
   12178           0 :                 put_task_struct(task);
   12179             :         }
   12180             : 
   12181           0 :         mutex_lock(&current->perf_event_mutex);
   12182           0 :         list_add_tail(&event->owner_entry, &current->perf_event_list);
   12183           0 :         mutex_unlock(&current->perf_event_mutex);
   12184             : 
   12185             :         /*
   12186             :          * Drop the reference on the group_event after placing the
   12187             :          * new event on the sibling_list. This ensures destruction
   12188             :          * of the group leader will find the pointer to itself in
   12189             :          * perf_group_detach().
   12190             :          */
   12191           0 :         fdput(group);
   12192           0 :         fd_install(event_fd, event_file);
   12193           0 :         return event_fd;
   12194             : 
   12195           0 : err_locked:
   12196           0 :         if (move_group)
   12197           0 :                 perf_event_ctx_unlock(group_leader, gctx);
   12198           0 :         mutex_unlock(&ctx->mutex);
   12199           0 : err_cred:
   12200           0 :         if (task)
   12201           0 :                 up_read(&task->signal->exec_update_lock);
   12202           0 : err_file:
   12203           0 :         fput(event_file);
   12204           0 : err_context:
   12205           0 :         perf_unpin_context(ctx);
   12206           0 :         put_ctx(ctx);
   12207           0 : err_alloc:
   12208             :         /*
   12209             :          * If event_file is set, the fput() above will have called ->release()
   12210             :          * and that will take care of freeing the event.
   12211             :          */
   12212           0 :         if (!event_file)
   12213           0 :                 free_event(event);
   12214           0 : err_task:
   12215           0 :         if (task)
   12216           0 :                 put_task_struct(task);
   12217           0 : err_group_fd:
   12218           0 :         fdput(group);
   12219           0 : err_fd:
   12220           0 :         put_unused_fd(event_fd);
   12221           0 :         return err;
   12222             : }
   12223             : 
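For reference, the canonical user-space entry into this syscall (in the style of the perf_event_open(2) man page): count retired instructions in the calling process, excluding the kernel so the perf_allow_kernel() check above is not needed. This assumes a PMU that implements PERF_COUNT_HW_INSTRUCTIONS.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 1;
	attr.exclude_kernel = 1;
	attr.exclude_hv = 1;

	fd = syscall(__NR_perf_event_open, &attr, 0 /* this task */,
		     -1 /* any cpu */, -1 /* no group */, PERF_FLAG_FD_CLOEXEC);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload to measure ... */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("instructions: %lld\n", count);

	close(fd);
	return 0;
}
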
   12224             : /**
   12225             :  * perf_event_create_kernel_counter
   12226             :  *
   12227             :  * @attr: attributes of the counter to create
   12228             :  * @cpu: cpu in which the counter is bound
   12229             :  * @task: task to profile (NULL for percpu)
   12230             :  */
   12231             : struct perf_event *
   12232           0 : perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
   12233             :                                  struct task_struct *task,
   12234             :                                  perf_overflow_handler_t overflow_handler,
   12235             :                                  void *context)
   12236             : {
   12237           0 :         struct perf_event_context *ctx;
   12238           0 :         struct perf_event *event;
   12239           0 :         int err;
   12240             : 
   12241             :         /*
   12242             :          * Grouping is not supported for kernel events, and neither is 'AUX';
   12243             :          * make sure the caller's intentions are adjusted.
   12244             :          */
   12245           0 :         if (attr->aux_output)
   12246           0 :                 return ERR_PTR(-EINVAL);
   12247             : 
   12248           0 :         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
   12249             :                                  overflow_handler, context, -1);
   12250           0 :         if (IS_ERR(event)) {
   12251           0 :                 err = PTR_ERR(event);
   12252           0 :                 goto err;
   12253             :         }
   12254             : 
   12255             :         /* Mark owner so we can distinguish it from user events. */
   12256           0 :         event->owner = TASK_TOMBSTONE;
   12257             : 
   12258             :         /*
   12259             :          * Get the target context (task or percpu):
   12260             :          */
   12261           0 :         ctx = find_get_context(event->pmu, task, event);
   12262           0 :         if (IS_ERR(ctx)) {
   12263           0 :                 err = PTR_ERR(ctx);
   12264           0 :                 goto err_free;
   12265             :         }
   12266             : 
   12267           0 :         WARN_ON_ONCE(ctx->parent_ctx);
   12268           0 :         mutex_lock(&ctx->mutex);
   12269           0 :         if (ctx->task == TASK_TOMBSTONE) {
   12270           0 :                 err = -ESRCH;
   12271           0 :                 goto err_unlock;
   12272             :         }
   12273             : 
   12274           0 :         if (!task) {
   12275             :                 /*
   12276             :                  * Check if the @cpu we're creating an event for is online.
   12277             :                  *
   12278             :                  * We use the perf_cpu_context::ctx::mutex to serialize against
   12279             :                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
   12280             :                  */
   12281           0 :                 struct perf_cpu_context *cpuctx =
   12282           0 :                         container_of(ctx, struct perf_cpu_context, ctx);
   12283           0 :                 if (!cpuctx->online) {
   12284           0 :                         err = -ENODEV;
   12285           0 :                         goto err_unlock;
   12286             :                 }
   12287             :         }
   12288             : 
   12289           0 :         if (!exclusive_event_installable(event, ctx)) {
   12290           0 :                 err = -EBUSY;
   12291           0 :                 goto err_unlock;
   12292             :         }
   12293             : 
   12294           0 :         perf_install_in_context(ctx, event, event->cpu);
   12295           0 :         perf_unpin_context(ctx);
   12296           0 :         mutex_unlock(&ctx->mutex);
   12297             : 
   12298           0 :         return event;
   12299             : 
   12300           0 : err_unlock:
   12301           0 :         mutex_unlock(&ctx->mutex);
   12302           0 :         perf_unpin_context(ctx);
   12303           0 :         put_ctx(ctx);
   12304           0 : err_free:
   12305           0 :         free_event(event);
   12306           0 : err:
   12307           0 :         return ERR_PTR(err);
   12308             : }
   12309             : EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
   12310             : 
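perf_event_create_kernel_counter() is the in-kernel counterpart of the syscall path above (used, for example, by the hard-lockup watchdog and hw_breakpoint code). A hedged module sketch, assuming a per-CPU software clock event is good enough for illustration; the module name, pr_info() text and choice of event are invented for the example:

// SPDX-License-Identifier: GPL-2.0
#include <linux/module.h>
#include <linux/err.h>
#include <linux/perf_event.h>

static struct perf_event *ev;

static int __init kcounter_init(void)
{
	struct perf_event_attr attr = {
		.type	= PERF_TYPE_SOFTWARE,
		.config	= PERF_COUNT_SW_CPU_CLOCK,
		.size	= sizeof(attr),
	};

	/* Per-CPU counter on CPU 0; no task, no overflow handler. */
	ev = perf_event_create_kernel_counter(&attr, 0, NULL, NULL, NULL);
	return IS_ERR(ev) ? PTR_ERR(ev) : 0;
}

static void __exit kcounter_exit(void)
{
	u64 enabled, running;
	u64 count = perf_event_read_value(ev, &enabled, &running);

	pr_info("cpu clock on CPU0: %llu ns\n", (unsigned long long)count);
	perf_event_release_kernel(ev);
}

module_init(kcounter_init);
module_exit(kcounter_exit);
MODULE_LICENSE("GPL");
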
   12311           0 : void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
   12312             : {
   12313           0 :         struct perf_event_context *src_ctx;
   12314           0 :         struct perf_event_context *dst_ctx;
   12315           0 :         struct perf_event *event, *tmp;
   12316           0 :         LIST_HEAD(events);
   12317             : 
   12318           0 :         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
   12319           0 :         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
   12320             : 
   12321             :         /*
   12322             :          * See perf_event_ctx_lock() for comments on the details
   12323             :          * of swizzling perf_event::ctx.
   12324             :          */
   12325           0 :         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
   12326           0 :         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
   12327             :                                  event_entry) {
   12328           0 :                 perf_remove_from_context(event, 0);
   12329           0 :                 unaccount_event_cpu(event, src_cpu);
   12330           0 :                 put_ctx(src_ctx);
   12331           0 :                 list_add(&event->migrate_entry, &events);
   12332             :         }
   12333             : 
   12334             :         /*
   12335             :          * Wait for the events to quiesce before re-instating them.
   12336             :          */
   12337           0 :         synchronize_rcu();
   12338             : 
   12339             :         /*
   12340             :          * Re-instate events in 2 passes.
   12341             :          *
   12342             :          * Skip over group leaders and only install siblings on this first
   12343             :          * pass; siblings will not get enabled without a leader, but a
   12344             :          * leader will enable its siblings, even if those are still on the old
   12345             :          * context.
   12346             :          */
   12347           0 :         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
   12348           0 :                 if (event->group_leader == event)
   12349           0 :                         continue;
   12350             : 
   12351           0 :                 list_del(&event->migrate_entry);
   12352           0 :                 if (event->state >= PERF_EVENT_STATE_OFF)
   12353           0 :                         event->state = PERF_EVENT_STATE_INACTIVE;
   12354           0 :                 account_event_cpu(event, dst_cpu);
   12355           0 :                 perf_install_in_context(dst_ctx, event, dst_cpu);
   12356           0 :                 get_ctx(dst_ctx);
   12357             :         }
   12358             : 
   12359             :         /*
   12360             :          * Once all the siblings are set up properly, install the group leaders
   12361             :          * to make it go.
   12362             :          */
   12363           0 :         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
   12364           0 :                 list_del(&event->migrate_entry);
   12365           0 :                 if (event->state >= PERF_EVENT_STATE_OFF)
   12366           0 :                         event->state = PERF_EVENT_STATE_INACTIVE;
   12367           0 :                 account_event_cpu(event, dst_cpu);
   12368           0 :                 perf_install_in_context(dst_ctx, event, dst_cpu);
   12369           0 :                 get_ctx(dst_ctx);
   12370             :         }
   12371           0 :         mutex_unlock(&dst_ctx->mutex);
   12372           0 :         mutex_unlock(&src_ctx->mutex);
   12373           0 : }
   12374             : EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
   12375             : 
   12376           0 : static void sync_child_event(struct perf_event *child_event,
   12377             :                                struct task_struct *child)
   12378             : {
   12379           0 :         struct perf_event *parent_event = child_event->parent;
   12380           0 :         u64 child_val;
   12381             : 
   12382           0 :         if (child_event->attr.inherit_stat)
   12383           0 :                 perf_event_read_event(child_event, child);
   12384             : 
   12385           0 :         child_val = perf_event_count(child_event);
   12386             : 
   12387             :         /*
   12388             :          * Add back the child's count to the parent's count:
   12389             :          */
   12390           0 :         atomic64_add(child_val, &parent_event->child_count);
   12391           0 :         atomic64_add(child_event->total_time_enabled,
   12392             :                      &parent_event->child_total_time_enabled);
   12393           0 :         atomic64_add(child_event->total_time_running,
   12394             :                      &parent_event->child_total_time_running);
   12395           0 : }
   12396             : 
   12397             : static void
   12398           0 : perf_event_exit_event(struct perf_event *child_event,
   12399             :                       struct perf_event_context *child_ctx,
   12400             :                       struct task_struct *child)
   12401             : {
   12402           0 :         struct perf_event *parent_event = child_event->parent;
   12403             : 
   12404             :         /*
   12405             :          * Do not destroy the 'original' grouping; because of the context
   12406             :          * switch optimization the original events could've ended up in a
   12407             :          * random child task.
   12408             :          *
   12409             :          * If we were to destroy the original group, all group related
   12410             :          * operations would cease to function properly after this random
   12411             :          * child dies.
   12412             :          *
   12413             :          * Do destroy all inherited groups; we don't care about those,
   12414             :          * and being thorough is better.
   12415             :          */
   12416           0 :         raw_spin_lock_irq(&child_ctx->lock);
   12417           0 :         WARN_ON_ONCE(child_ctx->is_active);
   12418             : 
   12419           0 :         if (parent_event)
   12420           0 :                 perf_group_detach(child_event);
   12421           0 :         list_del_event(child_event, child_ctx);
   12422           0 :         perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
   12423           0 :         raw_spin_unlock_irq(&child_ctx->lock);
   12424             : 
   12425             :         /*
   12426             :          * Parent events are governed by their filedesc, retain them.
   12427             :          */
   12428           0 :         if (!parent_event) {
   12429           0 :                 perf_event_wakeup(child_event);
   12430           0 :                 return;
   12431             :         }
   12432             :         /*
   12433             :          * Child events can be cleaned up.
   12434             :          */
   12435             : 
   12436           0 :         sync_child_event(child_event, child);
   12437             : 
   12438             :         /*
   12439             :          * Remove this event from the parent's list
   12440             :          */
   12441           0 :         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
   12442           0 :         mutex_lock(&parent_event->child_mutex);
   12443           0 :         list_del_init(&child_event->child_list);
   12444           0 :         mutex_unlock(&parent_event->child_mutex);
   12445             : 
   12446             :         /*
   12447             :          * Kick perf_poll() for is_event_hup().
   12448             :          */
   12449           0 :         perf_event_wakeup(parent_event);
   12450           0 :         free_event(child_event);
   12451           0 :         put_event(parent_event);
   12452             : }
   12453             : 
   12454        2310 : static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
   12455             : {
   12456        2310 :         struct perf_event_context *child_ctx, *clone_ctx = NULL;
   12457        2310 :         struct perf_event *child_event, *next;
   12458             : 
   12459        2310 :         WARN_ON_ONCE(child != current);
   12460             : 
   12461        2310 :         child_ctx = perf_pin_task_context(child, ctxn);
   12462        2310 :         if (!child_ctx)
   12463             :                 return;
   12464             : 
   12465             :         /*
   12466             :          * In order to reduce the amount of trickery in ctx tear-down, we hold
   12467             :          * ctx::mutex over the entire thing. This serializes against almost
   12468             :          * everything that wants to access the ctx.
   12469             :          *
   12470             :          * The exception is sys_perf_event_open() /
   12471             :          * perf_event_create_kernel_counter() which does find_get_context()
   12472             :          * without ctx::mutex (it cannot because of the move_group double mutex
   12473             :          * lock thing). See the comments in perf_install_in_context().
   12474             :          */
   12475           0 :         mutex_lock(&child_ctx->mutex);
   12476             : 
   12477             :         /*
   12478             :          * In a single ctx::lock section, de-schedule the events and detach the
   12479             :          * context from the task such that we cannot ever get it scheduled back
   12480             :          * in.
   12481             :          */
   12482           0 :         raw_spin_lock_irq(&child_ctx->lock);
   12483           0 :         task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
   12484             : 
   12485             :         /*
   12486             :          * Now that the context is inactive, destroy the task <-> ctx relation
   12487             :          * and mark the context dead.
   12488             :          */
   12489           0 :         RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
   12490           0 :         put_ctx(child_ctx); /* cannot be last */
   12491           0 :         WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
   12492           0 :         put_task_struct(current); /* cannot be last */
   12493             : 
   12494           0 :         clone_ctx = unclone_ctx(child_ctx);
   12495           0 :         raw_spin_unlock_irq(&child_ctx->lock);
   12496             : 
   12497           0 :         if (clone_ctx)
   12498           0 :                 put_ctx(clone_ctx);
   12499             : 
   12500             :         /*
   12501             :          * Report the task dead after unscheduling the events so that we
   12502             :          * won't get any samples after PERF_RECORD_EXIT. We can however still
   12503             :          * get a few PERF_RECORD_READ events.
   12504             :          */
   12505           0 :         perf_event_task(child, child_ctx, 0);
   12506             : 
   12507           0 :         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
   12508           0 :                 perf_event_exit_event(child_event, child_ctx, child);
   12509             : 
   12510           0 :         mutex_unlock(&child_ctx->mutex);
   12511             : 
   12512           0 :         put_ctx(child_ctx);
   12513             : }
   12514             : 
   12515             : /*
   12516             :  * When a child task exits, feed back event values to parent events.
   12517             :  *
   12518             :  * Can be called with exec_update_lock held when called from
   12519             :  * setup_new_exec().
   12520             :  */
   12521        1155 : void perf_event_exit_task(struct task_struct *child)
   12522             : {
   12523        1155 :         struct perf_event *event, *tmp;
   12524        1155 :         int ctxn;
   12525             : 
   12526        1155 :         mutex_lock(&child->perf_event_mutex);
   12527        1155 :         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
   12528             :                                  owner_entry) {
   12529           0 :                 list_del_init(&event->owner_entry);
   12530             : 
   12531             :                 /*
   12532             :                  * Ensure the list deletion is visible before we clear
   12533             :                  * the owner, closes a race against perf_release() where
   12534             :          * the owner; this closes a race against perf_release() where
   12535             :                  */
   12536           0 :                 smp_store_release(&event->owner, NULL);
   12537             :         }
   12538        1155 :         mutex_unlock(&child->perf_event_mutex);
   12539             : 
   12540        4620 :         for_each_task_context_nr(ctxn)
   12541        2310 :                 perf_event_exit_task_context(child, ctxn);
   12542             : 
   12543             :         /*
   12544             :          * The perf_event_exit_task_context() calls above invoke perf_event_task()
   12545             :          * with the child's task_ctx, which generates EXIT events for
   12546             :          * child contexts and sets child->perf_event_ctxp[] to NULL.
   12547             :          * At this point we need to send EXIT events to cpu contexts.
   12548             :          */
   12549        1155 :         perf_event_task(child, NULL, 0);
   12550        1155 : }
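
For illustration, a minimal userspace sketch (not part of core.c) of what the exit path above provides: with attr.inherit set, counts accumulated by a child are folded back into the parent's event when the child exits, so a read() in the parent after wait() covers both tasks. It uses the raw perf_event_open(2) syscall as in its manual page.

        #include <linux/perf_event.h>
        #include <asm/unistd.h>
        #include <sys/syscall.h>
        #include <sys/wait.h>
        #include <unistd.h>
        #include <stdint.h>
        #include <stdio.h>

        int main(void)
        {
                struct perf_event_attr attr = {
                        .type    = PERF_TYPE_SOFTWARE,
                        .config  = PERF_COUNT_SW_TASK_CLOCK,
                        .size    = sizeof(attr),
                        .inherit = 1,   /* clone the event into children on fork() */
                };
                uint64_t count;
                int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);

                if (fd < 0)
                        return 1;

                if (fork() == 0) {      /* child runs with an inherited event */
                        for (volatile long i = 0; i < 10000000; i++)
                                ;
                        _exit(0);       /* perf_event_exit_task() feeds its count back */
                }
                wait(NULL);

                read(fd, &count, sizeof(count));        /* parent + exited child, in ns */
                printf("task-clock: %llu ns\n", (unsigned long long)count);
                return 0;
        }
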
   12551             : 
   12552           0 : static void perf_free_event(struct perf_event *event,
   12553             :                             struct perf_event_context *ctx)
   12554             : {
   12555           0 :         struct perf_event *parent = event->parent;
   12556             : 
   12557           0 :         if (WARN_ON_ONCE(!parent))
   12558             :                 return;
   12559             : 
   12560           0 :         mutex_lock(&parent->child_mutex);
   12561           0 :         list_del_init(&event->child_list);
   12562           0 :         mutex_unlock(&parent->child_mutex);
   12563             : 
   12564           0 :         put_event(parent);
   12565             : 
   12566           0 :         raw_spin_lock_irq(&ctx->lock);
   12567           0 :         perf_group_detach(event);
   12568           0 :         list_del_event(event, ctx);
   12569           0 :         raw_spin_unlock_irq(&ctx->lock);
   12570           0 :         free_event(event);
   12571             : }
   12572             : 
   12573             : /*
   12574             :  * Free a context as created by inheritance by perf_event_init_task() below,
   12575             :  * used by fork() in case of failure.
   12576             :  *
   12577             :  * Even though the task has never lived, the context and events have been
   12578             :  * exposed through the child_list, so we must take care tearing it all down.
   12579             :  */
   12580           0 : void perf_event_free_task(struct task_struct *task)
   12581             : {
   12582           0 :         struct perf_event_context *ctx;
   12583           0 :         struct perf_event *event, *tmp;
   12584           0 :         int ctxn;
   12585             : 
   12586           0 :         for_each_task_context_nr(ctxn) {
   12587           0 :                 ctx = task->perf_event_ctxp[ctxn];
   12588           0 :                 if (!ctx)
   12589           0 :                         continue;
   12590             : 
   12591           0 :                 mutex_lock(&ctx->mutex);
   12592           0 :                 raw_spin_lock_irq(&ctx->lock);
   12593             :                 /*
   12594             :                  * Destroy the task <-> ctx relation and mark the context dead.
   12595             :                  *
   12596             :                  * This is important because even though the task hasn't been
   12597             :                  * exposed yet the context has been (through child_list).
   12598             :                  */
   12599           0 :                 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
   12600           0 :                 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
   12601           0 :                 put_task_struct(task); /* cannot be last */
   12602           0 :                 raw_spin_unlock_irq(&ctx->lock);
   12603             : 
   12604           0 :                 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
   12605           0 :                         perf_free_event(event, ctx);
   12606             : 
   12607           0 :                 mutex_unlock(&ctx->mutex);
   12608             : 
   12609             :                 /*
   12610             :                  * perf_event_release_kernel() could've stolen some of our
   12611             :                  * child events and still have them on its free_list. In that
   12612             :                  * case we must wait for these events to have been freed (in
   12613             :                  * particular all their references to this task must've been
   12614             :                  * dropped).
   12615             :                  *
   12616             :          * Without this, copy_process() will unconditionally free this
   12617             :                  * task (irrespective of its reference count) and
   12618             :                  * _free_event()'s put_task_struct(event->hw.target) will be a
   12619             :                  * use-after-free.
   12620             :                  *
   12621             :                  * Wait for all events to drop their context reference.
   12622             :                  */
   12623           0 :                 wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
   12624           0 :                 put_ctx(ctx); /* must be last */
   12625             :         }
   12626           0 : }
   12627             : 
   12628        1153 : void perf_event_delayed_put(struct task_struct *task)
   12629             : {
   12630        1153 :         int ctxn;
   12631             : 
   12632        3459 :         for_each_task_context_nr(ctxn)
   12633        2306 :                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
   12634        1153 : }
   12635             : 
   12636           0 : struct file *perf_event_get(unsigned int fd)
   12637             : {
   12638           0 :         struct file *file = fget(fd);
   12639           0 :         if (!file)
   12640           0 :                 return ERR_PTR(-EBADF);
   12641             : 
   12642           0 :         if (file->f_op != &perf_fops) {
   12643           0 :                 fput(file);
   12644           0 :                 return ERR_PTR(-EBADF);
   12645             :         }
   12646             : 
   12647             :         return file;
   12648             : }
   12649             : 
   12650           0 : const struct perf_event *perf_get_event(struct file *file)
   12651             : {
   12652           0 :         if (file->f_op != &perf_fops)
   12653           0 :                 return ERR_PTR(-EINVAL);
   12654             : 
   12655           0 :         return file->private_data;
   12656             : }
   12657             : 
   12658           0 : const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
   12659             : {
   12660           0 :         if (!event)
   12661           0 :                 return ERR_PTR(-EINVAL);
   12662             : 
   12663           0 :         return &event->attr;
   12664             : }
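
As an illustration of how these three helpers fit together, a hypothetical in-kernel caller (the function name below is made up) could resolve a user-supplied fd like this, dropping the file reference taken by perf_event_get() when done:

        static int example_inspect_perf_fd(unsigned int fd)    /* hypothetical */
        {
                struct file *file = perf_event_get(fd);
                const struct perf_event *event;

                if (IS_ERR(file))
                        return PTR_ERR(file);   /* not a perf event fd */

                event = perf_get_event(file);
                if (IS_ERR(event)) {
                        fput(file);
                        return PTR_ERR(event);
                }

                pr_info("perf fd %u: type=%u config=%llu\n", fd,
                        event->attr.type, (unsigned long long)event->attr.config);

                fput(file);     /* drop the reference taken by perf_event_get() */
                return 0;
        }
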
   12665             : 
   12666             : /*
   12667             :  * Inherit an event from parent task to child task.
   12668             :  *
   12669             :  * Returns:
   12670             :  *  - valid pointer on success
   12671             :  *  - NULL for orphaned events
   12672             :  *  - IS_ERR() on error
   12673             :  */
   12674             : static struct perf_event *
   12675           0 : inherit_event(struct perf_event *parent_event,
   12676             :               struct task_struct *parent,
   12677             :               struct perf_event_context *parent_ctx,
   12678             :               struct task_struct *child,
   12679             :               struct perf_event *group_leader,
   12680             :               struct perf_event_context *child_ctx)
   12681             : {
   12682           0 :         enum perf_event_state parent_state = parent_event->state;
   12683           0 :         struct perf_event *child_event;
   12684           0 :         unsigned long flags;
   12685             : 
   12686             :         /*
   12687             :          * Instead of creating recursive hierarchies of events,
   12688             :          * we link inherited events back to the original parent,
   12689             :          * which is guaranteed to have a filp, which we use as the reference
   12690             :          * count:
   12691             :          */
   12692           0 :         if (parent_event->parent)
   12693           0 :                 parent_event = parent_event->parent;
   12694             : 
   12695           0 :         child_event = perf_event_alloc(&parent_event->attr,
   12696             :                                            parent_event->cpu,
   12697             :                                            child,
   12698             :                                            group_leader, parent_event,
   12699             :                                            NULL, NULL, -1);
   12700           0 :         if (IS_ERR(child_event))
   12701             :                 return child_event;
   12702             : 
   12703             : 
   12704           0 :         if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
   12705           0 :             !child_ctx->task_ctx_data) {
   12706           0 :                 struct pmu *pmu = child_event->pmu;
   12707             : 
   12708           0 :                 child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
   12709           0 :                 if (!child_ctx->task_ctx_data) {
   12710           0 :                         free_event(child_event);
   12711           0 :                         return ERR_PTR(-ENOMEM);
   12712             :                 }
   12713             :         }
   12714             : 
   12715             :         /*
   12716             :          * is_orphaned_event() and list_add_tail(&parent_event->child_list)
   12717             :          * must be under the same lock in order to serialize against
   12718             :          * perf_event_release_kernel(), such that either we must observe
   12719             :          * is_orphaned_event() or they will observe us on the child_list.
   12720             :          */
   12721           0 :         mutex_lock(&parent_event->child_mutex);
   12722           0 :         if (is_orphaned_event(parent_event) ||
   12723           0 :             !atomic_long_inc_not_zero(&parent_event->refcount)) {
   12724           0 :                 mutex_unlock(&parent_event->child_mutex);
   12725             :                 /* task_ctx_data is freed with child_ctx */
   12726           0 :                 free_event(child_event);
   12727           0 :                 return NULL;
   12728             :         }
   12729             : 
   12730           0 :         get_ctx(child_ctx);
   12731             : 
   12732             :         /*
   12733             :          * Make the child state follow the state of the parent event,
   12734             :          * not its attr.disabled bit.  We hold the parent's mutex,
   12735             :          * so we won't race with perf_event_{en, dis}able_family.
   12736             :          */
   12737           0 :         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
   12738           0 :                 child_event->state = PERF_EVENT_STATE_INACTIVE;
   12739             :         else
   12740           0 :                 child_event->state = PERF_EVENT_STATE_OFF;
   12741             : 
   12742           0 :         if (parent_event->attr.freq) {
   12743           0 :                 u64 sample_period = parent_event->hw.sample_period;
   12744           0 :                 struct hw_perf_event *hwc = &child_event->hw;
   12745             : 
   12746           0 :                 hwc->sample_period = sample_period;
   12747           0 :                 hwc->last_period   = sample_period;
   12748             : 
   12749           0 :                 local64_set(&hwc->period_left, sample_period);
   12750             :         }
   12751             : 
   12752           0 :         child_event->ctx = child_ctx;
   12753           0 :         child_event->overflow_handler = parent_event->overflow_handler;
   12754           0 :         child_event->overflow_handler_context
   12755           0 :                 = parent_event->overflow_handler_context;
   12756             : 
   12757             :         /*
   12758             :          * Precalculate sample_data sizes
   12759             :          */
   12760           0 :         perf_event__header_size(child_event);
   12761           0 :         perf_event__id_header_size(child_event);
   12762             : 
   12763             :         /*
   12764             :          * Link it up in the child's context:
   12765             :          */
   12766           0 :         raw_spin_lock_irqsave(&child_ctx->lock, flags);
   12767           0 :         add_event_to_ctx(child_event, child_ctx);
   12768           0 :         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
   12769             : 
   12770             :         /*
   12771             :          * Link this into the parent event's child list
   12772             :          */
   12773           0 :         list_add_tail(&child_event->child_list, &parent_event->child_list);
   12774           0 :         mutex_unlock(&parent_event->child_mutex);
   12775             : 
   12776           0 :         return child_event;
   12777             : }
   12778             : 
   12779             : /*
   12780             :  * Inherits an event group.
   12781             :  *
   12782             :  * This will quietly suppress orphaned events; !inherit_event() is not an error.
   12783             :  * This matches with perf_event_release_kernel() removing all child events.
   12784             :  *
   12785             :  * Returns:
   12786             :  *  - 0 on success
   12787             :  *  - <0 on error
   12788             :  */
   12789           0 : static int inherit_group(struct perf_event *parent_event,
   12790             :               struct task_struct *parent,
   12791             :               struct perf_event_context *parent_ctx,
   12792             :               struct task_struct *child,
   12793             :               struct perf_event_context *child_ctx)
   12794             : {
   12795           0 :         struct perf_event *leader;
   12796           0 :         struct perf_event *sub;
   12797           0 :         struct perf_event *child_ctr;
   12798             : 
   12799           0 :         leader = inherit_event(parent_event, parent, parent_ctx,
   12800             :                                  child, NULL, child_ctx);
   12801           0 :         if (IS_ERR(leader))
   12802           0 :                 return PTR_ERR(leader);
   12803             :         /*
   12804             :          * @leader can be NULL here because of is_orphaned_event(). In this
   12805             :          * case inherit_event() will create individual events, similar to what
   12806             :          * perf_group_detach() would do anyway.
   12807             :          */
   12808           0 :         for_each_sibling_event(sub, parent_event) {
   12809           0 :                 child_ctr = inherit_event(sub, parent, parent_ctx,
   12810             :                                             child, leader, child_ctx);
   12811           0 :                 if (IS_ERR(child_ctr))
   12812           0 :                         return PTR_ERR(child_ctr);
   12813             : 
   12814           0 :                 if (sub->aux_event == parent_event && child_ctr &&
   12815           0 :                     !perf_get_aux_event(child_ctr, leader))
   12816             :                         return -EINVAL;
   12817             :         }
   12818             :         return 0;
   12819             : }
   12820             : 
   12821             : /*
   12822             :  * Creates the child task context and tries to inherit the event-group.
   12823             :  *
   12824             :  * Clears @inherited_all on !attr.inherit or error. Note that we'll leave
   12825             :  * inherited_all set when we 'fail' to inherit an orphaned event; this is
   12826             :  * consistent with perf_event_release_kernel() removing all child events.
   12827             :  *
   12828             :  * Returns:
   12829             :  *  - 0 on success
   12830             :  *  - <0 on error
   12831             :  */
   12832             : static int
   12833           0 : inherit_task_group(struct perf_event *event, struct task_struct *parent,
   12834             :                    struct perf_event_context *parent_ctx,
   12835             :                    struct task_struct *child, int ctxn,
   12836             :                    int *inherited_all)
   12837             : {
   12838           0 :         int ret;
   12839           0 :         struct perf_event_context *child_ctx;
   12840             : 
   12841           0 :         if (!event->attr.inherit) {
   12842           0 :                 *inherited_all = 0;
   12843           0 :                 return 0;
   12844             :         }
   12845             : 
   12846           0 :         child_ctx = child->perf_event_ctxp[ctxn];
   12847           0 :         if (!child_ctx) {
   12848             :                 /*
   12849             :                  * This is executed from the parent task context, so
   12850             :                  * inherit events that have been marked for cloning.
   12851             :                  * First allocate and initialize a context for the
   12852             :                  * child.
   12853             :                  */
   12854           0 :                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
   12855           0 :                 if (!child_ctx)
   12856             :                         return -ENOMEM;
   12857             : 
   12858           0 :                 child->perf_event_ctxp[ctxn] = child_ctx;
   12859             :         }
   12860             : 
   12861           0 :         ret = inherit_group(event, parent, parent_ctx,
   12862             :                             child, child_ctx);
   12863             : 
   12864           0 :         if (ret)
   12865           0 :                 *inherited_all = 0;
   12866             : 
   12867             :         return ret;
   12868             : }
   12869             : 
   12870             : /*
   12871             :  * Initialize the perf_event context in task_struct
   12872             :  */
   12873        2468 : static int perf_event_init_context(struct task_struct *child, int ctxn)
   12874             : {
   12875        2468 :         struct perf_event_context *child_ctx, *parent_ctx;
   12876        2468 :         struct perf_event_context *cloned_ctx;
   12877        2468 :         struct perf_event *event;
   12878        2468 :         struct task_struct *parent = current;
   12879        2468 :         int inherited_all = 1;
   12880        2468 :         unsigned long flags;
   12881        2468 :         int ret = 0;
   12882             : 
   12883        2468 :         if (likely(!parent->perf_event_ctxp[ctxn]))
   12884             :                 return 0;
   12885             : 
   12886             :         /*
   12887             :          * If the parent's context is a clone, pin it so it won't get
   12888             :          * swapped under us.
   12889             :          */
   12890           0 :         parent_ctx = perf_pin_task_context(parent, ctxn);
   12891           0 :         if (!parent_ctx)
   12892             :                 return 0;
   12893             : 
   12894             :         /*
   12895             :          * No need to check if parent_ctx != NULL here; since we saw
   12896             :          * it non-NULL earlier, the only reason for it to become NULL
   12897             :          * is if we exit, and since we're currently in the middle of
   12898             :          * a fork we can't be exiting at the same time.
   12899             :          */
   12900             : 
   12901             :         /*
   12902             :          * Lock the parent list. No need to lock the child - not PID
   12903             :          * hashed yet and not running, so nobody can access it.
   12904             :          */
   12905           0 :         mutex_lock(&parent_ctx->mutex);
   12906             : 
   12907             :         /*
   12908             :          * We dont have to disable NMIs - we are only looking at
   12909             :          * We don't have to disable NMIs - we are only looking at
   12910             :          */
   12911           0 :         perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
   12912           0 :                 ret = inherit_task_group(event, parent, parent_ctx,
   12913             :                                          child, ctxn, &inherited_all);
   12914           0 :                 if (ret)
   12915           0 :                         goto out_unlock;
   12916             :         }
   12917             : 
   12918             :         /*
   12919             :          * We can't hold ctx->lock when iterating the ->flexible_groups list due
   12920             :          * to allocations, but we need to prevent rotation because
   12921             :          * rotate_ctx() will change the list from interrupt context.
   12922             :          */
   12923           0 :         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
   12924           0 :         parent_ctx->rotate_disable = 1;
   12925           0 :         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
   12926             : 
   12927           0 :         perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
   12928           0 :                 ret = inherit_task_group(event, parent, parent_ctx,
   12929             :                                          child, ctxn, &inherited_all);
   12930           0 :                 if (ret)
   12931           0 :                         goto out_unlock;
   12932             :         }
   12933             : 
   12934           0 :         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
   12935           0 :         parent_ctx->rotate_disable = 0;
   12936             : 
   12937           0 :         child_ctx = child->perf_event_ctxp[ctxn];
   12938             : 
   12939           0 :         if (child_ctx && inherited_all) {
   12940             :                 /*
   12941             :                  * Mark the child context as a clone of the parent
   12942             :                  * context, or of whatever the parent is a clone of.
   12943             :                  *
   12944             :                  * Note that if the parent is a clone, the holding of
   12945             :          * parent_ctx->lock prevents it from being uncloned.
   12946             :                  */
   12947           0 :                 cloned_ctx = parent_ctx->parent_ctx;
   12948           0 :                 if (cloned_ctx) {
   12949           0 :                         child_ctx->parent_ctx = cloned_ctx;
   12950           0 :                         child_ctx->parent_gen = parent_ctx->parent_gen;
   12951             :                 } else {
   12952           0 :                         child_ctx->parent_ctx = parent_ctx;
   12953           0 :                         child_ctx->parent_gen = parent_ctx->generation;
   12954             :                 }
   12955           0 :                 get_ctx(child_ctx->parent_ctx);
   12956             :         }
   12957             : 
   12958           0 :         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
   12959           0 : out_unlock:
   12960           0 :         mutex_unlock(&parent_ctx->mutex);
   12961             : 
   12962           0 :         perf_unpin_context(parent_ctx);
   12963           0 :         put_ctx(parent_ctx);
   12964             : 
   12965           0 :         return ret;
   12966             : }
   12967             : 
   12968             : /*
   12969             :  * Initialize the perf_event context in task_struct
   12970             :  */
   12971        1234 : int perf_event_init_task(struct task_struct *child)
   12972             : {
   12973        1234 :         int ctxn, ret;
   12974             : 
   12975        1234 :         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
   12976        1234 :         mutex_init(&child->perf_event_mutex);
   12977        1234 :         INIT_LIST_HEAD(&child->perf_event_list);
   12978             : 
   12979        3702 :         for_each_task_context_nr(ctxn) {
   12980        2468 :                 ret = perf_event_init_context(child, ctxn);
   12981        2468 :                 if (ret) {
   12982           0 :                         perf_event_free_task(child);
   12983           0 :                         return ret;
   12984             :                 }
   12985             :         }
   12986             : 
   12987             :         return 0;
   12988             : }
   12989             : 
   12990           1 : static void __init perf_event_init_all_cpus(void)
   12991             : {
   12992           1 :         struct swevent_htable *swhash;
   12993           1 :         int cpu;
   12994             : 
   12995           1 :         zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
   12996             : 
   12997           6 :         for_each_possible_cpu(cpu) {
   12998           4 :                 swhash = &per_cpu(swevent_htable, cpu);
   12999           4 :                 mutex_init(&swhash->hlist_mutex);
   13000           4 :                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
   13001             : 
   13002           4 :                 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
   13003           4 :                 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
   13004             : 
   13005             : #ifdef CONFIG_CGROUP_PERF
   13006             :                 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
   13007             : #endif
   13008           5 :                 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
   13009             :         }
   13010           1 : }
   13011             : 
   13012           7 : static void perf_swevent_init_cpu(unsigned int cpu)
   13013             : {
   13014           7 :         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
   13015             : 
   13016           7 :         mutex_lock(&swhash->hlist_mutex);
   13017           7 :         if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
   13018           0 :                 struct swevent_hlist *hlist;
   13019             : 
   13020           0 :                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
   13021           0 :                 WARN_ON(!hlist);
   13022           0 :                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
   13023             :         }
   13024           7 :         mutex_unlock(&swhash->hlist_mutex);
   13025           7 : }
   13026             : 
   13027             : #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
   13028           0 : static void __perf_event_exit_context(void *__info)
   13029             : {
   13030           0 :         struct perf_event_context *ctx = __info;
   13031           0 :         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
   13032           0 :         struct perf_event *event;
   13033             : 
   13034           0 :         raw_spin_lock(&ctx->lock);
   13035           0 :         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
   13036           0 :         list_for_each_entry(event, &ctx->event_list, event_entry)
   13037           0 :                 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
   13038           0 :         raw_spin_unlock(&ctx->lock);
   13039           0 : }
   13040             : 
   13041           0 : static void perf_event_exit_cpu_context(int cpu)
   13042             : {
   13043           0 :         struct perf_cpu_context *cpuctx;
   13044           0 :         struct perf_event_context *ctx;
   13045           0 :         struct pmu *pmu;
   13046             : 
   13047           0 :         mutex_lock(&pmus_lock);
   13048           0 :         list_for_each_entry(pmu, &pmus, entry) {
   13049           0 :                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
   13050           0 :                 ctx = &cpuctx->ctx;
   13051             : 
   13052           0 :                 mutex_lock(&ctx->mutex);
   13053           0 :                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
   13054           0 :                 cpuctx->online = 0;
   13055           0 :                 mutex_unlock(&ctx->mutex);
   13056             :         }
   13057           0 :         cpumask_clear_cpu(cpu, perf_online_mask);
   13058           0 :         mutex_unlock(&pmus_lock);
   13059           0 : }
   13060             : #else
   13061             : 
   13062             : static void perf_event_exit_cpu_context(int cpu) { }
   13063             : 
   13064             : #endif
   13065             : 
   13066           7 : int perf_event_init_cpu(unsigned int cpu)
   13067             : {
   13068           7 :         struct perf_cpu_context *cpuctx;
   13069           7 :         struct perf_event_context *ctx;
   13070           7 :         struct pmu *pmu;
   13071             : 
   13072           7 :         perf_swevent_init_cpu(cpu);
   13073             : 
   13074           7 :         mutex_lock(&pmus_lock);
   13075           7 :         cpumask_set_cpu(cpu, perf_online_mask);
   13076          47 :         list_for_each_entry(pmu, &pmus, entry) {
   13077          40 :                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
   13078          40 :                 ctx = &cpuctx->ctx;
   13079             : 
   13080          40 :                 mutex_lock(&ctx->mutex);
   13081          40 :                 cpuctx->online = 1;
   13082          40 :                 mutex_unlock(&ctx->mutex);
   13083             :         }
   13084           7 :         mutex_unlock(&pmus_lock);
   13085             : 
   13086           7 :         return 0;
   13087             : }
   13088             : 
   13089           0 : int perf_event_exit_cpu(unsigned int cpu)
   13090             : {
   13091           0 :         perf_event_exit_cpu_context(cpu);
   13092           0 :         return 0;
   13093             : }
   13094             : 
   13095             : static int
   13096           0 : perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
   13097             : {
   13098           0 :         int cpu;
   13099             : 
   13100           0 :         for_each_online_cpu(cpu)
   13101           0 :                 perf_event_exit_cpu(cpu);
   13102             : 
   13103           0 :         return NOTIFY_OK;
   13104             : }
   13105             : 
   13106             : /*
   13107             :  * Run the perf reboot notifier at the very last possible moment so that
   13108             :  * the generic watchdog code runs as long as possible.
   13109             :  */
   13110             : static struct notifier_block perf_reboot_notifier = {
   13111             :         .notifier_call = perf_reboot,
   13112             :         .priority = INT_MIN,
   13113             : };
   13114             : 
   13115           1 : void __init perf_event_init(void)
   13116             : {
   13117           1 :         int ret;
   13118             : 
   13119           1 :         idr_init(&pmu_idr);
   13120             : 
   13121           1 :         perf_event_init_all_cpus();
   13122           1 :         init_srcu_struct(&pmus_srcu);
   13123           1 :         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
   13124           1 :         perf_pmu_register(&perf_cpu_clock, NULL, -1);
   13125           1 :         perf_pmu_register(&perf_task_clock, NULL, -1);
   13126           1 :         perf_tp_register();
   13127           1 :         perf_event_init_cpu(smp_processor_id());
   13128           1 :         register_reboot_notifier(&perf_reboot_notifier);
   13129             : 
   13130           1 :         ret = init_hw_breakpoint();
   13131           1 :         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
   13132             : 
   13133             :         /*
   13134             :          * Build time assertion that we keep the data_head at the intended
   13135             :          * location.  IOW, validation that we got the __reserved[] size right.
   13136             :          */
   13137           1 :         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
   13138             :                      != 1024);
   13139           1 : }
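
A minimal userspace sketch (not part of core.c) of why that data_head offset is ABI: tooling mmap()s the event fd and reads data_head/data_tail from the metadata page at the start of the mapping to consume the ring buffer, so the field must stay at a fixed location.

        #include <linux/perf_event.h>
        #include <stdint.h>

        /* base is the start of the perf mmap(), i.e. the metadata page */
        static uint64_t perf_ring_bytes_available(void *base)
        {
                struct perf_event_mmap_page *pc = base;
                /* acquire pairs with the kernel's barrier before data_head updates */
                uint64_t head = __atomic_load_n(&pc->data_head, __ATOMIC_ACQUIRE);

                return head - pc->data_tail;    /* produced but not yet consumed */
        }
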
   13140             : 
   13141           0 : ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
   13142             :                               char *page)
   13143             : {
   13144           0 :         struct perf_pmu_events_attr *pmu_attr =
   13145           0 :                 container_of(attr, struct perf_pmu_events_attr, attr);
   13146             : 
   13147           0 :         if (pmu_attr->event_str)
   13148           0 :                 return sprintf(page, "%s\n", pmu_attr->event_str);
   13149             : 
   13150             :         return 0;
   13151             : }
   13152             : EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
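
The string produced here is what appears under /sys/bus/event_source/devices/<pmu>/events/<event>. A minimal userspace sketch (not part of core.c; the helper name is illustrative) reading one such file:

        #include <stdio.h>

        static int read_pmu_event_string(const char *pmu, const char *event,
                                         char *buf, size_t len)
        {
                char path[256];
                FILE *f;

                snprintf(path, sizeof(path),
                         "/sys/bus/event_source/devices/%s/events/%s", pmu, event);
                f = fopen(path, "r");
                if (!f)
                        return -1;
                if (!fgets(buf, len, f)) {      /* contents look like "event=0x3c" */
                        fclose(f);
                        return -1;
                }
                fclose(f);
                return 0;
        }
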
   13153             : 
   13154           1 : static int __init perf_event_sysfs_init(void)
   13155             : {
   13156           1 :         struct pmu *pmu;
   13157           1 :         int ret;
   13158             : 
   13159           1 :         mutex_lock(&pmus_lock);
   13160             : 
   13161           1 :         ret = bus_register(&pmu_bus);
   13162           1 :         if (ret)
   13163           0 :                 goto unlock;
   13164             : 
   13165           8 :         list_for_each_entry(pmu, &pmus, entry) {
   13166           7 :                 if (!pmu->name || pmu->type < 0)
   13167           2 :                         continue;
   13168             : 
   13169           5 :                 ret = pmu_dev_alloc(pmu);
   13170           5 :                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
   13171             :         }
   13172           1 :         pmu_bus_running = 1;
   13173           1 :         ret = 0;
   13174             : 
   13175           1 : unlock:
   13176           1 :         mutex_unlock(&pmus_lock);
   13177             : 
   13178           1 :         return ret;
   13179             : }
   13180             : device_initcall(perf_event_sysfs_init);
   13181             : 
   13182             : #ifdef CONFIG_CGROUP_PERF
   13183             : static struct cgroup_subsys_state *
   13184             : perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
   13185             : {
   13186             :         struct perf_cgroup *jc;
   13187             : 
   13188             :         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
   13189             :         if (!jc)
   13190             :                 return ERR_PTR(-ENOMEM);
   13191             : 
   13192             :         jc->info = alloc_percpu(struct perf_cgroup_info);
   13193             :         if (!jc->info) {
   13194             :                 kfree(jc);
   13195             :                 return ERR_PTR(-ENOMEM);
   13196             :         }
   13197             : 
   13198             :         return &jc->css;
   13199             : }
   13200             : 
   13201             : static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
   13202             : {
   13203             :         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
   13204             : 
   13205             :         free_percpu(jc->info);
   13206             :         kfree(jc);
   13207             : }
   13208             : 
   13209             : static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
   13210             : {
   13211             :         perf_event_cgroup(css->cgroup);
   13212             :         return 0;
   13213             : }
   13214             : 
   13215             : static int __perf_cgroup_move(void *info)
   13216             : {
   13217             :         struct task_struct *task = info;
   13218             :         rcu_read_lock();
   13219             :         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
   13220             :         rcu_read_unlock();
   13221             :         return 0;
   13222             : }
   13223             : 
   13224             : static void perf_cgroup_attach(struct cgroup_taskset *tset)
   13225             : {
   13226             :         struct task_struct *task;
   13227             :         struct cgroup_subsys_state *css;
   13228             : 
   13229             :         cgroup_taskset_for_each(task, css, tset)
   13230             :                 task_function_call(task, __perf_cgroup_move, task);
   13231             : }
   13232             : 
   13233             : struct cgroup_subsys perf_event_cgrp_subsys = {
   13234             :         .css_alloc      = perf_cgroup_css_alloc,
   13235             :         .css_free       = perf_cgroup_css_free,
   13236             :         .css_online     = perf_cgroup_css_online,
   13237             :         .attach         = perf_cgroup_attach,
   13238             :         /*
   13239             :          * Implicitly enable on dfl hierarchy so that perf events can
   13240             :          * always be filtered by cgroup2 path as long as the perf_event
   13241             :          * controller is not mounted on a legacy hierarchy.
   13242             :          */
   13243             :         .implicit_on_dfl = true,
   13244             :         .threaded       = true,
   13245             : };
   13246             : #endif /* CONFIG_CGROUP_PERF */
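
A minimal userspace sketch (not part of core.c) of the cgroup filtering this controller enables: an event can be scoped to a cgroup by passing an open cgroup directory fd as the pid argument together with PERF_FLAG_PID_CGROUP; cgroup events are per-CPU, so cpu must be >= 0. The path and helper name below are illustrative only.

        #include <linux/perf_event.h>
        #include <asm/unistd.h>
        #include <sys/syscall.h>
        #include <fcntl.h>
        #include <unistd.h>

        static int open_cgroup_cycles(const char *cgroup_path, int cpu)
        {
                struct perf_event_attr attr = {
                        .type   = PERF_TYPE_HARDWARE,
                        .config = PERF_COUNT_HW_CPU_CYCLES,
                        .size   = sizeof(attr),
                };
                int cgrp_fd, fd;

                cgrp_fd = open(cgroup_path, O_RDONLY | O_DIRECTORY);
                if (cgrp_fd < 0)
                        return -1;

                /* pid is interpreted as a cgroup fd with PERF_FLAG_PID_CGROUP */
                fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu, -1,
                             PERF_FLAG_PID_CGROUP);
                close(cgrp_fd);         /* the event keeps its own cgroup reference */
                return fd;
        }
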

Generated by: LCOV version 1.14