Apply preempt_rt patch-4.9-rt1.patch.xz
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 43d43a2d5811d548271c138c0e1ed31d541c19ad..3748cb7b2d6e317d2e22a9372923f17fcc08cdb1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
        return ret;
 }
 
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
-       struct event_function_struct efs = {
-               .event = event,
-               .func = func,
-               .data = data,
-       };
-
-       int ret = event_function(&efs);
-       WARN_ON_ONCE(ret);
-}
-
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
        struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
        raw_spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+       struct perf_event_context *ctx = event->ctx;
+       struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct task_struct *task = READ_ONCE(ctx->task);
+       struct perf_event_context *task_ctx = NULL;
+
+       WARN_ON_ONCE(!irqs_disabled());
+
+       if (task) {
+               if (task == TASK_TOMBSTONE)
+                       return;
+
+               task_ctx = ctx;
+       }
+
+       perf_ctx_lock(cpuctx, task_ctx);
+
+       task = ctx->task;
+       if (task == TASK_TOMBSTONE)
+               goto unlock;
+
+       if (task) {
+               /*
+                * We must be either inactive or active and the right task,
+                * otherwise we're screwed, since we cannot IPI to somewhere
+                * else.
+                */
+               if (ctx->is_active) {
+                       if (WARN_ON_ONCE(task != current))
+                               goto unlock;
+
+                       if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+                               goto unlock;
+               }
+       } else {
+               WARN_ON_ONCE(&cpuctx->ctx != ctx);
+       }
+
+       func(event, cpuctx, ctx, data);
+unlock:
+       perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
@@ -335,6 +371,7 @@ static atomic_t perf_sched_count;
 
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -396,6 +433,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
        if (ret || !write)
                return ret;
 
+       /*
+        * If throttling is disabled don't allow the write:
+        */
+       if (sysctl_perf_cpu_time_max_percent == 100 ||
+           sysctl_perf_cpu_time_max_percent == 0)
+               return -EINVAL;
+
        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        update_perf_cpu_limits();
@@ -440,7 +484,7 @@ static u64 __report_allowed;
 
 static void perf_duration_warn(struct irq_work *w)
 {
-       printk_ratelimited(KERN_WARNING
+       printk_ratelimited(KERN_INFO
                "perf: interrupt took too long (%lld > %lld), lowering "
                "kernel.perf_event_max_sample_rate to %d\n",
                __report_avg, __report_allowed,
@@ -835,6 +879,40 @@ perf_cgroup_mark_enabled(struct perf_event *event,
                }
        }
 }
+
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+                        struct perf_event_context *ctx, bool add)
+{
+       struct perf_cpu_context *cpuctx;
+
+       if (!is_cgroup_event(event))
+               return;
+
+       if (add && ctx->nr_cgroups++)
+               return;
+       else if (!add && --ctx->nr_cgroups)
+               return;
+       /*
+        * Because cgroup events are always per-cpu events,
+        * this will always be called from the right CPU.
+        */
+       cpuctx = __get_cpu_context(ctx);
+
+       /*
+        * cpuctx->cgrp is NULL until a cgroup event is sched in or
+        * ctx->nr_cgroups == 0.
+        */
+       if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
+               cpuctx->cgrp = event->cgrp;
+       else if (!add)
+               cpuctx->cgrp = NULL;
+}
+
 #else /* !CONFIG_CGROUP_PERF */
 
 static inline bool
@@ -912,6 +990,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
                         struct perf_event_context *ctx)
 {
 }
+
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+                        struct perf_event_context *ctx, bool add)
+{
+}
+
 #endif
 
 /*
@@ -965,6 +1050,7 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
        raw_spin_lock_init(&cpuctx->hrtimer_lock);
        hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
        timer->function = perf_mux_hrtimer_handler;
+       timer->irqsafe = 1;
 }
 
 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
@@ -1384,6 +1470,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+
        lockdep_assert_held(&ctx->lock);
 
        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1397,15 +1484,13 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
        if (event->group_leader == event) {
                struct list_head *list;
 
-               if (is_software_event(event))
-                       event->group_flags |= PERF_GROUP_SOFTWARE;
+               event->group_caps = event->event_caps;
 
                list = ctx_group_list(event, ctx);
                list_add_tail(&event->group_entry, list);
        }
 
-       if (is_cgroup_event(event))
-               ctx->nr_cgroups++;
+       list_update_cgroup_event(event, ctx, true);
 
        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
@@ -1553,9 +1638,7 @@ static void perf_group_attach(struct perf_event *event)
 
        WARN_ON_ONCE(group_leader->ctx != event->ctx);
 
-       if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
-                       !is_software_event(event))
-               group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+       group_leader->group_caps &= event->event_caps;
 
        list_add_tail(&event->group_entry, &group_leader->sibling_list);
        group_leader->nr_siblings++;
@@ -1573,8 +1656,6 @@ static void perf_group_attach(struct perf_event *event)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-       struct perf_cpu_context *cpuctx;
-
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);
 
@@ -1586,20 +1667,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
        event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-       if (is_cgroup_event(event)) {
-               ctx->nr_cgroups--;
-               /*
-                * Because cgroup events are always per-cpu events, this will
-                * always be called from the right CPU.
-                */
-               cpuctx = __get_cpu_context(ctx);
-               /*
-                * If there are no more cgroup events then clear cgrp to avoid
-                * stale pointer in update_cgrp_time_from_cpuctx().
-                */
-               if (!ctx->nr_cgroups)
-                       cpuctx->cgrp = NULL;
-       }
+       list_update_cgroup_event(event, ctx, false);
 
        ctx->nr_events--;
        if (event->attr.inherit_stat)
@@ -1661,7 +1729,7 @@ static void perf_group_detach(struct perf_event *event)
                sibling->group_leader = sibling;
 
                /* Inherit group flags from the previous leader */
-               sibling->group_flags = event->group_flags;
+               sibling->group_caps = event->group_caps;
 
                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }
@@ -1708,8 +1776,8 @@ static inline int pmu_filter_match(struct perf_event *event)
 static inline int
 event_filter_match(struct perf_event *event)
 {
-       return (event->cpu == -1 || event->cpu == smp_processor_id())
-           && perf_cgroup_match(event) && pmu_filter_match(event);
+       return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+              perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
@@ -1729,8 +1797,8 @@ event_sched_out(struct perf_event *event,
         * maintained, otherwise bogus information is returned
         * via read() for time_enabled, time_running:
         */
-       if (event->state == PERF_EVENT_STATE_INACTIVE
-           && !event_filter_match(event)) {
+       if (event->state == PERF_EVENT_STATE_INACTIVE &&
+           !event_filter_match(event)) {
                delta = tstamp - event->tstamp_stopped;
                event->tstamp_running += delta;
                event->tstamp_stopped = tstamp;
@@ -1770,6 +1838,8 @@ group_sched_out(struct perf_event *group_event,
        struct perf_event *event;
        int state = group_event->state;
 
+       perf_pmu_disable(ctx->pmu);
+
        event_sched_out(group_event, cpuctx, ctx);
 
        /*
@@ -1778,6 +1848,8 @@ group_sched_out(struct perf_event *group_event,
        list_for_each_entry(event, &group_event->sibling_list, group_entry)
                event_sched_out(event, cpuctx, ctx);
 
+       perf_pmu_enable(ctx->pmu);
+
        if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
                cpuctx->exclusive = 0;
 }
@@ -1897,6 +1969,12 @@ void perf_event_disable(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_disable);
 
+void perf_event_disable_inatomic(struct perf_event *event)
+{
+       event->pending_disable = 1;
+       irq_work_queue(&event->pending);
+}
+
 static void perf_set_shadow_time(struct perf_event *event,
                                 struct perf_event_context *ctx,
                                 u64 tstamp)
@@ -2083,7 +2161,7 @@ static int group_can_go_on(struct perf_event *event,
        /*
         * Groups consisting entirely of software events can always go on.
         */
-       if (event->group_flags & PERF_GROUP_SOFTWARE)
+       if (event->group_caps & PERF_EV_CAP_SOFTWARE)
                return 1;
        /*
         * If an exclusive group is already on, no other hardware
@@ -2228,10 +2306,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 
        lockdep_assert_held(&ctx->mutex);
 
-       event->ctx = ctx;
        if (event->cpu != -1)
                event->cpu = cpu;
 
+       /*
+        * Ensures that if we can observe event->ctx, both the event and ctx
+        * will be 'complete'. See perf_iterate_sb_cpu().
+        */
+       smp_store_release(&event->ctx, ctx);
+
        if (!task) {
                cpu_function_call(cpu, __perf_install_in_context, event);
                return;
@@ -2424,16 +2507,16 @@ static int __perf_event_stop(void *info)
         * while restarting.
         */
        if (sd->restart)
-               event->pmu->start(event, PERF_EF_START);
+               event->pmu->start(event, 0);
 
        return 0;
 }
 
-static int perf_event_restart(struct perf_event *event)
+static int perf_event_stop(struct perf_event *event, int restart)
 {
        struct stop_event_data sd = {
                .event          = event,
-               .restart        = 1,
+               .restart        = restart,
        };
        int ret = 0;
 
@@ -2770,19 +2853,36 @@ unlock:
        }
 }
 
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
 void perf_sched_cb_dec(struct pmu *pmu)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
        this_cpu_dec(perf_sched_cb_usages);
+
+       if (!--cpuctx->sched_cb_usage)
+               list_del(&cpuctx->sched_cb_entry);
 }
 
+
 void perf_sched_cb_inc(struct pmu *pmu)
 {
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+       if (!cpuctx->sched_cb_usage++)
+               list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
        this_cpu_inc(perf_sched_cb_usages);
 }
 
 /*
  * This function provides the context switch callback to the lower code
  * layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
  */
 static void perf_pmu_sched_task(struct task_struct *prev,
                                struct task_struct *next,
@@ -2790,34 +2890,24 @@ static void perf_pmu_sched_task(struct task_struct *prev,
 {
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
-       unsigned long flags;
 
        if (prev == next)
                return;
 
-       local_irq_save(flags);
-
-       rcu_read_lock();
+       list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+               pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
 
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               if (pmu->sched_task) {
-                       cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-                       perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
-                       perf_pmu_disable(pmu);
+               if (WARN_ON_ONCE(!pmu->sched_task))
+                       continue;
 
-                       pmu->sched_task(cpuctx->task_ctx, sched_in);
+               perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+               perf_pmu_disable(pmu);
 
-                       perf_pmu_enable(pmu);
+               pmu->sched_task(cpuctx->task_ctx, sched_in);
 
-                       perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-               }
+               perf_pmu_enable(pmu);
+               perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
        }
-
-       rcu_read_unlock();
-
-       local_irq_restore(flags);
 }
 
 static void perf_event_switch(struct task_struct *task,
@@ -3349,6 +3439,22 @@ struct perf_read_data {
        int ret;
 };
 
+static int find_cpu_to_read(struct perf_event *event, int local_cpu)
+{
+       int event_cpu = event->oncpu;
+       u16 local_pkg, event_pkg;
+
+       if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
+               event_pkg =  topology_physical_package_id(event_cpu);
+               local_pkg =  topology_physical_package_id(local_cpu);
+
+               if (event_pkg == local_pkg)
+                       return local_cpu;
+       }
+
+       return event_cpu;
+}
+
 /*
  * Cross CPU call to read the hardware event
  */
@@ -3470,7 +3576,7 @@ u64 perf_event_read_local(struct perf_event *event)
 
 static int perf_event_read(struct perf_event *event, bool group)
 {
-       int ret = 0;
+       int ret = 0, cpu_to_read, local_cpu;
 
        /*
         * If event is enabled and currently active on a CPU, update the
@@ -3482,8 +3588,22 @@ static int perf_event_read(struct perf_event *event, bool group)
                        .group = group,
                        .ret = 0,
                };
-               smp_call_function_single(event->oncpu,
-                                        __perf_event_read, &data, 1);
+
+               local_cpu = get_cpu();
+               cpu_to_read = find_cpu_to_read(event, local_cpu);
+               put_cpu();
+
+               /*
+                * Purposely ignore the smp_call_function_single() return
+                * value.
+                *
+                * If event->oncpu isn't a valid CPU it means the event got
+                * scheduled out and that will have updated the event count.
+                *
+                * Therefore, either way, we'll have an up-to-date event count
+                * after this.
+                */
+               (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1);
                ret = data.ret;
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
@@ -3686,6 +3806,39 @@ static void free_event_rcu(struct rcu_head *head)
 static void ring_buffer_attach(struct perf_event *event,
                               struct ring_buffer *rb);
 
+static void detach_sb_event(struct perf_event *event)
+{
+       struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+       raw_spin_lock(&pel->lock);
+       list_del_rcu(&event->sb_list);
+       raw_spin_unlock(&pel->lock);
+}
+
+static bool is_sb_event(struct perf_event *event)
+{
+       struct perf_event_attr *attr = &event->attr;
+
+       if (event->parent)
+               return false;
+
+       if (event->attach_state & PERF_ATTACH_TASK)
+               return false;
+
+       if (attr->mmap || attr->mmap_data || attr->mmap2 ||
+           attr->comm || attr->comm_exec ||
+           attr->task ||
+           attr->context_switch)
+               return true;
+       return false;
+}
+
+static void unaccount_pmu_sb_event(struct perf_event *event)
+{
+       if (is_sb_event(event))
+               detach_sb_event(event);
+}
+
 static void unaccount_event_cpu(struct perf_event *event, int cpu)
 {
        if (event->parent)
@@ -3749,6 +3902,8 @@ static void unaccount_event(struct perf_event *event)
        }
 
        unaccount_event_cpu(event, event->cpu);
+
+       unaccount_pmu_sb_event(event);
 }
 
 static void perf_sched_delayed(struct work_struct *work)
@@ -3818,7 +3973,7 @@ static void exclusive_event_destroy(struct perf_event *event)
 
 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
 {
-       if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) &&
+       if ((e1->pmu == e2->pmu) &&
            (e1->cpu == e2->cpu ||
             e1->cpu == -1 ||
             e2->cpu == -1))
@@ -4734,6 +4889,19 @@ static void ring_buffer_attach(struct perf_event *event,
                spin_unlock_irqrestore(&rb->event_lock, flags);
        }
 
+       /*
+        * Avoid racing with perf_mmap_close(AUX): stop the event
+        * before swizzling the event::rb pointer; if it's getting
+        * unmapped, its aux_mmap_count will be 0 and it won't
+        * restart. See the comment in __perf_pmu_output_stop().
+        *
+        * Data will inevitably be lost when set_output is done in
+        * mid-air, but then again, whoever does it like this is
+        * not in for the data anyway.
+        */
+       if (has_aux(event))
+               perf_event_stop(event, 0);
+
        rcu_assign_pointer(event->rb, rb);
 
        if (old_rb) {
@@ -5226,9 +5394,10 @@ perf_output_sample_regs(struct perf_output_handle *handle,
                        struct pt_regs *regs, u64 mask)
 {
        int bit;
+       DECLARE_BITMAP(_mask, 64);
 
-       for_each_set_bit(bit, (const unsigned long *) &mask,
-                        sizeof(mask) * BITS_PER_BYTE) {
+       bitmap_from_u64(_mask, mask);
+       for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
                u64 val;
 
                val = perf_reg_value(regs, bit);
@@ -5574,16 +5743,26 @@ void perf_output_sample(struct perf_output_handle *handle,
        }
 
        if (sample_type & PERF_SAMPLE_RAW) {
-               if (data->raw) {
-                       u32 raw_size = data->raw->size;
-                       u32 real_size = round_up(raw_size + sizeof(u32),
-                                                sizeof(u64)) - sizeof(u32);
-                       u64 zero = 0;
-
-                       perf_output_put(handle, real_size);
-                       __output_copy(handle, data->raw->data, raw_size);
-                       if (real_size - raw_size)
-                               __output_copy(handle, &zero, real_size - raw_size);
+               struct perf_raw_record *raw = data->raw;
+
+               if (raw) {
+                       struct perf_raw_frag *frag = &raw->frag;
+
+                       perf_output_put(handle, raw->size);
+                       do {
+                               if (frag->copy) {
+                                       __output_custom(handle, frag->copy,
+                                                       frag->data, frag->size);
+                               } else {
+                                       __output_copy(handle, frag->data,
+                                                     frag->size);
+                               }
+                               if (perf_raw_frag_last(frag))
+                                       break;
+                               frag = frag->next;
+                       } while (1);
+                       if (frag->pad)
+                               __output_skip(handle, NULL, frag->pad);
                } else {
                        struct {
                                u32     size;
@@ -5708,14 +5887,28 @@ void perf_prepare_sample(struct perf_event_header *header,
        }
 
        if (sample_type & PERF_SAMPLE_RAW) {
-               int size = sizeof(u32);
-
-               if (data->raw)
-                       size += data->raw->size;
-               else
-                       size += sizeof(u32);
+               struct perf_raw_record *raw = data->raw;
+               int size;
+
+               if (raw) {
+                       struct perf_raw_frag *frag = &raw->frag;
+                       u32 sum = 0;
+
+                       do {
+                               sum += frag->size;
+                               if (perf_raw_frag_last(frag))
+                                       break;
+                               frag = frag->next;
+                       } while (1);
+
+                       size = round_up(sum + sizeof(u32), sizeof(u64));
+                       raw->size = size - sizeof(u32);
+                       frag->pad = raw->size - sum;
+               } else {
+                       size = sizeof(u64);
+               }
 
-               header->size += round_up(size, sizeof(u64));
+               header->size += size;
        }
 
        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -5875,11 +6068,11 @@ perf_event_read_event(struct perf_event *event,
        perf_output_end(&handle);
 }
 
-typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+typedef void (perf_iterate_f)(struct perf_event *event, void *data);
 
 static void
-perf_event_aux_ctx(struct perf_event_context *ctx,
-                  perf_event_aux_output_cb output,
+perf_iterate_ctx(struct perf_event_context *ctx,
+                  perf_iterate_f output,
                   void *data, bool all)
 {
        struct perf_event *event;
@@ -5896,52 +6089,63 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
        }
 }
 
-static void
-perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
-                       struct perf_event_context *task_ctx)
+static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 {
-       rcu_read_lock();
-       preempt_disable();
-       perf_event_aux_ctx(task_ctx, output, data, false);
-       preempt_enable();
-       rcu_read_unlock();
+       struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
+       struct perf_event *event;
+
+       list_for_each_entry_rcu(event, &pel->list, sb_list) {
+               /*
+                * Skip events that are not fully formed yet; ensure that
+                * if we observe event->ctx, both event and ctx will be
+                * complete enough. See perf_install_in_context().
+                */
+               if (!smp_load_acquire(&event->ctx))
+                       continue;
+
+               if (event->state < PERF_EVENT_STATE_INACTIVE)
+                       continue;
+               if (!event_filter_match(event))
+                       continue;
+               output(event, data);
+       }
 }
 
+/*
+ * Iterate all events that need to receive side-band events.
+ *
+ * For new callers; ensure that account_pmu_sb_event() includes
+ * your event, otherwise it might not get delivered.
+ */
 static void
-perf_event_aux(perf_event_aux_output_cb output, void *data,
+perf_iterate_sb(perf_iterate_f output, void *data,
               struct perf_event_context *task_ctx)
 {
-       struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
-       struct pmu *pmu;
        int ctxn;
 
+       rcu_read_lock();
+       preempt_disable();
+
        /*
-        * If we have task_ctx != NULL we only notify
-        * the task context itself. The task_ctx is set
-        * only for EXIT events before releasing task
+        * If we have task_ctx != NULL we only notify the task context itself.
+        * The task_ctx is set only for EXIT events before releasing task
         * context.
         */
        if (task_ctx) {
-               perf_event_aux_task_ctx(output, data, task_ctx);
-               return;
+               perf_iterate_ctx(task_ctx, output, data, false);
+               goto done;
        }
 
-       rcu_read_lock();
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
-               if (cpuctx->unique_pmu != pmu)
-                       goto next;
-               perf_event_aux_ctx(&cpuctx->ctx, output, data, false);
-               ctxn = pmu->task_ctx_nr;
-               if (ctxn < 0)
-                       goto next;
+       perf_iterate_sb_cpu(output, data);
+
+       for_each_task_context_nr(ctxn) {
                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
                if (ctx)
-                       perf_event_aux_ctx(ctx, output, data, false);
-next:
-               put_cpu_ptr(pmu->pmu_cpu_context);
+                       perf_iterate_ctx(ctx, output, data, false);
        }
+done:
+       preempt_enable();
        rcu_read_unlock();
 }
 
@@ -5974,7 +6178,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
        raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
        if (restart)
-               perf_event_restart(event);
+               perf_event_stop(event, 1);
 }
 
 void perf_event_exec(void)
@@ -5990,7 +6194,7 @@ void perf_event_exec(void)
 
                perf_event_enable_on_exec(ctxn);
 
-               perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL,
+               perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
                                   true);
        }
        rcu_read_unlock();
@@ -6018,7 +6222,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
 
        /*
         * In case of inheritance, it will be the parent that links to the
-        * ring-buffer, but it will be the child that's actually using it:
+        * ring-buffer, but it will be the child that's actually using it.
+        *
+        * We are using event::rb to determine if the event should be stopped,
+        * however this may race with ring_buffer_attach() (through set_output),
+        * which will make us skip the event that actually needs to be stopped.
+        * So ring_buffer_attach() has to stop an aux event before re-assigning
+        * its rb pointer.
         */
        if (rcu_dereference(parent->rb) == rb)
                ro->err = __perf_event_stop(&sd);
@@ -6028,15 +6238,15 @@ static int __perf_pmu_output_stop(void *info)
 {
        struct perf_event *event = info;
        struct pmu *pmu = event->pmu;
-       struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+       struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
        struct remote_output ro = {
                .rb     = event->rb,
        };
 
        rcu_read_lock();
-       perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
+       perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
        if (cpuctx->task_ctx)
-               perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+               perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
                                   &ro, false);
        rcu_read_unlock();
 
@@ -6165,7 +6375,7 @@ static void perf_event_task(struct task_struct *task,
                },
        };
 
-       perf_event_aux(perf_event_task_output,
+       perf_iterate_sb(perf_event_task_output,
                       &task_event,
                       task_ctx);
 }
@@ -6244,7 +6454,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 
        comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 
-       perf_event_aux(perf_event_comm_output,
+       perf_iterate_sb(perf_event_comm_output,
                       comm_event,
                       NULL);
 }
@@ -6475,22 +6685,13 @@ got_name:
 
        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
-       perf_event_aux(perf_event_mmap_output,
+       perf_iterate_sb(perf_event_mmap_output,
                       mmap_event,
                       NULL);
 
        kfree(buf);
 }
 
-/*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
-       return filter->filter && filter->inode;
-}
-
 /*
  * Check whether inode and address range match filter criteria.
  */
@@ -6541,7 +6742,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
        raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
        if (restart)
-               perf_event_restart(event);
+               perf_event_stop(event, 1);
 }
 
 /*
@@ -6552,13 +6753,20 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
        struct perf_event_context *ctx;
        int ctxn;
 
+       /*
+        * Data tracing isn't supported yet and as such there is no need
+        * to keep track of anything that isn't related to executable code:
+        */
+       if (!(vma->vm_flags & VM_EXEC))
+               return;
+
        rcu_read_lock();
        for_each_task_context_nr(ctxn) {
                ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
                if (!ctx)
                        continue;
 
-               perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true);
+               perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
        }
        rcu_read_unlock();
 }
@@ -6745,7 +6953,7 @@ static void perf_event_switch(struct task_struct *task,
                },
        };
 
-       perf_event_aux(perf_event_switch_output,
+       perf_iterate_sb(perf_event_switch_output,
                       &switch_event,
                       NULL);
 }
@@ -6882,11 +7090,11 @@ static int __perf_event_overflow(struct perf_event *event,
        if (events && atomic_dec_and_test(&event->event_limit)) {
                ret = 1;
                event->pending_kill = POLL_HUP;
-               event->pending_disable = 1;
-               irq_work_queue(&event->pending);
+
+               perf_event_disable_inatomic(event);
        }
 
-       event->overflow_handler(event, data, regs);
+       READ_ONCE(event->overflow_handler)(event, data, regs);
 
        if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
@@ -7352,7 +7560,7 @@ static struct pmu perf_swevent = {
 static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_sample_data *data)
 {
-       void *record = data->raw->data;
+       void *record = data->raw->frag.data;
 
        /* only top level events have filters set */
        if (event->parent)
@@ -7408,8 +7616,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
        struct perf_event *event;
 
        struct perf_raw_record raw = {
-               .size = entry_size,
-               .data = record,
+               .frag = {
+                       .size = entry_size,
+                       .data = record,
+               },
        };
 
        perf_sample_data_init(&data, 0, 0);
@@ -7499,11 +7709,83 @@ static void perf_event_free_filter(struct perf_event *event)
        ftrace_profile_free_filter(event);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+static void bpf_overflow_handler(struct perf_event *event,
+                                struct perf_sample_data *data,
+                                struct pt_regs *regs)
+{
+       struct bpf_perf_event_data_kern ctx = {
+               .data = data,
+               .regs = regs,
+       };
+       int ret = 0;
+
+       preempt_disable();
+       if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
+               goto out;
+       rcu_read_lock();
+       ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
+       rcu_read_unlock();
+out:
+       __this_cpu_dec(bpf_prog_active);
+       preempt_enable();
+       if (!ret)
+               return;
+
+       event->orig_overflow_handler(event, data, regs);
+}
+
+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+{
+       struct bpf_prog *prog;
+
+       if (event->overflow_handler_context)
+               /* hw breakpoint or kernel counter */
+               return -EINVAL;
+
+       if (event->prog)
+               return -EEXIST;
+
+       prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
+       if (IS_ERR(prog))
+               return PTR_ERR(prog);
+
+       event->prog = prog;
+       event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
+       WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
+       return 0;
+}
+
+static void perf_event_free_bpf_handler(struct perf_event *event)
+{
+       struct bpf_prog *prog = event->prog;
+
+       if (!prog)
+               return;
+
+       WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
+       event->prog = NULL;
+       bpf_prog_put(prog);
+}
+#else
+static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
+{
+       return -EOPNOTSUPP;
+}
+static void perf_event_free_bpf_handler(struct perf_event *event)
+{
+}
+#endif
+
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
        bool is_kprobe, is_tracepoint;
        struct bpf_prog *prog;
 
+       if (event->attr.type == PERF_TYPE_HARDWARE ||
+           event->attr.type == PERF_TYPE_SOFTWARE)
+               return perf_event_set_bpf_handler(event, prog_fd);
+
        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -EINVAL;
 
@@ -7544,13 +7826,15 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
 {
        struct bpf_prog *prog;
 
+       perf_event_free_bpf_handler(event);
+
        if (!event->tp_event)
                return;
 
        prog = event->tp_event->prog;
        if (prog) {
                event->tp_event->prog = NULL;
-               bpf_prog_put_rcu(prog);
+               bpf_prog_put(prog);
        }
 }
 
@@ -7702,7 +7986,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
        list_for_each_entry(filter, &ifh->list, entry) {
                event->addr_filters_offs[count] = 0;
 
-               if (perf_addr_filter_needs_mmap(filter))
+               /*
+                * Adjust base offset if the filter is associated to a binary
+                * that needs to be mapped:
+                */
+               if (filter->inode)
                        event->addr_filters_offs[count] =
                                perf_addr_filter_apply(filter, mm);
 
@@ -7717,7 +8005,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
        mmput(mm);
 
 restart:
-       perf_event_restart(event);
+       perf_event_stop(event, 1);
 }
 
 /*
@@ -7739,6 +8027,7 @@ restart:
  * if <size> is not specified, the range is treated as a single address.
  */
 enum {
+       IF_ACT_NONE = -1,
        IF_ACT_FILTER,
        IF_ACT_START,
        IF_ACT_STOP,
@@ -7762,6 +8051,7 @@ static const match_table_t if_tokens = {
        { IF_SRC_KERNEL,        "%u/%u" },
        { IF_SRC_FILEADDR,      "%u@%s" },
        { IF_SRC_KERNELADDR,    "%u" },
+       { IF_ACT_NONE,          NULL },
 };
 
 /*
@@ -7833,8 +8123,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                                        goto fail;
                        }
 
-                       if (token == IF_SRC_FILE) {
-                               filename = match_strdup(&args[2]);
+                       if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+                               int fpos = filter->range ? 2 : 1;
+
+                               filename = match_strdup(&args[fpos]);
                                if (!filename) {
                                        ret = -ENOMEM;
                                        goto fail;
@@ -8044,6 +8336,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
 
        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hwc->hrtimer.function = perf_swevent_hrtimer;
+       hwc->hrtimer.irqsafe = 1;
 
        /*
         * Since hrtimers have a fixed rate, we can do a static freq->period
@@ -8580,7 +8873,10 @@ EXPORT_SYMBOL_GPL(perf_pmu_register);
 
 void perf_pmu_unregister(struct pmu *pmu)
 {
+       int remove_device;
+
        mutex_lock(&pmus_lock);
+       remove_device = pmu_bus_running;
        list_del_rcu(&pmu->entry);
        mutex_unlock(&pmus_lock);
 
@@ -8594,10 +8890,12 @@ void perf_pmu_unregister(struct pmu *pmu)
        free_percpu(pmu->pmu_disable_count);
        if (pmu->type >= PERF_TYPE_MAX)
                idr_remove(&pmu_idr, pmu->type);
-       if (pmu->nr_addr_filters)
-               device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
-       device_del(pmu->dev);
-       put_device(pmu->dev);
+       if (remove_device) {
+               if (pmu->nr_addr_filters)
+                       device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+               device_del(pmu->dev);
+               put_device(pmu->dev);
+       }
        free_pmu_context(pmu);
 }
 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -8667,6 +8965,28 @@ unlock:
        return pmu;
 }
 
+static void attach_sb_event(struct perf_event *event)
+{
+       struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
+
+       raw_spin_lock(&pel->lock);
+       list_add_rcu(&event->sb_list, &pel->list);
+       raw_spin_unlock(&pel->lock);
+}
+
+/*
+ * We keep a list of all !task (and therefore per-cpu) events
+ * that need to receive side-band records.
+ *
+ * This avoids having to scan all the various PMU per-cpu contexts
+ * looking for them.
+ */
+static void account_pmu_sb_event(struct perf_event *event)
+{
+       if (is_sb_event(event))
+               attach_sb_event(event);
+}
+
 static void account_event_cpu(struct perf_event *event, int cpu)
 {
        if (event->parent)
@@ -8747,6 +9067,8 @@ static void account_event(struct perf_event *event)
 enabled:
 
        account_event_cpu(event, event->cpu);
+
+       account_pmu_sb_event(event);
 }
 
 /*
@@ -8830,6 +9152,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        if (!overflow_handler && parent_event) {
                overflow_handler = parent_event->overflow_handler;
                context = parent_event->overflow_handler_context;
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
+               if (overflow_handler == bpf_overflow_handler) {
+                       struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
+
+                       if (IS_ERR(prog)) {
+                               err = PTR_ERR(prog);
+                               goto err_ns;
+                       }
+                       event->prog = prog;
+                       event->orig_overflow_handler =
+                               parent_event->orig_overflow_handler;
+               }
+#endif
        }
 
        if (overflow_handler) {
@@ -8895,7 +9230,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
-                       err = get_callchain_buffers();
+                       err = get_callchain_buffers(attr->sample_max_stack);
                        if (err)
                                goto err_addr_filters;
                }
@@ -9217,6 +9552,9 @@ SYSCALL_DEFINE5(perf_event_open,
                        return -EINVAL;
        }
 
+       if (!attr.sample_max_stack)
+               attr.sample_max_stack = sysctl_perf_event_max_stack;
+
        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
@@ -9290,7 +9628,7 @@ SYSCALL_DEFINE5(perf_event_open,
 
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
-                       err = -ENOTSUPP;
+                       err = -EOPNOTSUPP;
                        goto err_alloc;
                }
        }
@@ -9307,6 +9645,9 @@ SYSCALL_DEFINE5(perf_event_open,
                        goto err_alloc;
        }
 
+       if (pmu->task_ctx_nr == perf_sw_context)
+               event->event_caps |= PERF_EV_CAP_SOFTWARE;
+
        if (group_leader &&
            (is_software_event(event) != is_software_event(group_leader))) {
                if (is_software_event(event)) {
@@ -9320,7 +9661,7 @@ SYSCALL_DEFINE5(perf_event_open,
                         */
                        pmu = group_leader->pmu;
                } else if (is_software_event(group_leader) &&
-                          (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
+                          (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
                        /*
                         * In case the group is a pure software group, and we
                         * try to add a hardware event, move the whole group to
@@ -10252,10 +10593,15 @@ static void __init perf_event_init_all_cpus(void)
                swhash = &per_cpu(swevent_htable, cpu);
                mutex_init(&swhash->hlist_mutex);
                INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
+
+               INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
+               raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
+
+               INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
        }
 }
 
-static void perf_event_init_cpu(int cpu)
+int perf_event_init_cpu(unsigned int cpu)
 {
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
@@ -10268,6 +10614,7 @@ static void perf_event_init_cpu(int cpu)
                rcu_assign_pointer(swhash->swevent_hlist, hlist);
        }
        mutex_unlock(&swhash->hlist_mutex);
+       return 0;
 }
 
 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
@@ -10299,14 +10646,17 @@ static void perf_event_exit_cpu_context(int cpu)
        }
        srcu_read_unlock(&pmus_srcu, idx);
 }
+#else
+
+static void perf_event_exit_cpu_context(int cpu) { }
 
-static void perf_event_exit_cpu(int cpu)
+#endif
+
+int perf_event_exit_cpu(unsigned int cpu)
 {
        perf_event_exit_cpu_context(cpu);
+       return 0;
 }
-#else
-static inline void perf_event_exit_cpu(int cpu) { }
-#endif
 
 static int
 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
@@ -10328,46 +10678,6 @@ static struct notifier_block perf_reboot_notifier = {
        .priority = INT_MIN,
 };
 
-static int
-perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
-{
-       unsigned int cpu = (long)hcpu;
-
-       switch (action & ~CPU_TASKS_FROZEN) {
-
-       case CPU_UP_PREPARE:
-               /*
-                * This must be done before the CPU comes alive, because the
-                * moment we can run tasks we can encounter (software) events.
-                *
-                * Specifically, someone can have inherited events on kthreadd
-                * or a pre-existing worker thread that gets re-bound.
-                */
-               perf_event_init_cpu(cpu);
-               break;
-
-       case CPU_DOWN_PREPARE:
-               /*
-                * This must be done before the CPU dies because after that an
-                * active event might want to IPI the CPU and that'll not work
-                * so great for dead CPUs.
-                *
-                * XXX smp_call_function_single() return -ENXIO without a warn
-                * so we could possibly deal with this.
-                *
-                * This is safe against new events arriving because
-                * sys_perf_event_open() serializes against hotplug using
-                * get_online_cpus().
-                */
-               perf_event_exit_cpu(cpu);
-               break;
-       default:
-               break;
-       }
-
-       return NOTIFY_OK;
-}
-
 void __init perf_event_init(void)
 {
        int ret;
@@ -10380,7 +10690,7 @@ void __init perf_event_init(void)
        perf_pmu_register(&perf_cpu_clock, NULL, -1);
        perf_pmu_register(&perf_task_clock, NULL, -1);
        perf_tp_register();
-       perf_cpu_notifier(perf_cpu_notify);
+       perf_event_init_cpu(smp_processor_id());
        register_reboot_notifier(&perf_reboot_notifier);
 
        ret = init_hw_breakpoint();