FreeBSD/Linux Kernel Cross Reference
sys/kernel/perf_event.c


    1 /*
    2  * Performance events core code:
    3  *
    4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
    5  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
    6  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
    7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
    8  *
    9  * For licensing details see kernel-base/COPYING
   10  */
   11 
   12 #include <linux/fs.h>
   13 #include <linux/mm.h>
   14 #include <linux/cpu.h>
   15 #include <linux/smp.h>
   16 #include <linux/file.h>
   17 #include <linux/poll.h>
   18 #include <linux/slab.h>
   19 #include <linux/hash.h>
   20 #include <linux/sysfs.h>
   21 #include <linux/dcache.h>
   22 #include <linux/percpu.h>
   23 #include <linux/ptrace.h>
   24 #include <linux/vmstat.h>
   25 #include <linux/vmalloc.h>
   26 #include <linux/hardirq.h>
   27 #include <linux/rculist.h>
   28 #include <linux/uaccess.h>
   29 #include <linux/syscalls.h>
   30 #include <linux/anon_inodes.h>
   31 #include <linux/kernel_stat.h>
   32 #include <linux/perf_event.h>
   33 #include <linux/ftrace_event.h>
   34 #include <linux/hw_breakpoint.h>
   35 
   36 #include <asm/irq_regs.h>
   37 
   38 /*
   39  * Each CPU has a list of per CPU events:
   40  */
   41 static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
   42 
   43 int perf_max_events __read_mostly = 1;
   44 static int perf_reserved_percpu __read_mostly;
   45 static int perf_overcommit __read_mostly = 1;
   46 
   47 static atomic_t nr_events __read_mostly;
   48 static atomic_t nr_mmap_events __read_mostly;
   49 static atomic_t nr_comm_events __read_mostly;
   50 static atomic_t nr_task_events __read_mostly;
   51 
   52 /*
   53  * perf event paranoia level:
   54  *  -1 - not paranoid at all
   55  *   0 - disallow raw tracepoint access for unpriv
   56  *   1 - disallow cpu events for unpriv
   57  *   2 - disallow kernel profiling for unpriv
   58  */
   59 int sysctl_perf_event_paranoid __read_mostly = 1;
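
/*
 * Illustrative sketch (not part of the original file): the paranoia level
 * above is normally consulted through small predicates mirroring the three
 * thresholds documented in the comment.  The helper names below are for
 * illustration only; the kernel's own checks live in <linux/perf_event.h>.
 */
static inline bool example_paranoid_tracepoint_raw(void)
{
        return sysctl_perf_event_paranoid > -1;
}

static inline bool example_paranoid_cpu(void)
{
        return sysctl_perf_event_paranoid > 0;
}

static inline bool example_paranoid_kernel(void)
{
        return sysctl_perf_event_paranoid > 1;
}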
   60 
   61 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
   62 
   63 /*
   64  * max perf event sample rate
   65  */
   66 int sysctl_perf_event_sample_rate __read_mostly = 100000;
   67 
   68 static atomic64_t perf_event_id;
   69 
   70 /*
   71  * Lock for (sysadmin-configurable) event reservations:
   72  */
   73 static DEFINE_SPINLOCK(perf_resource_lock);
   74 
   75 /*
   76  * Architecture provided APIs - weak aliases:
   77  */
   78 extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
   79 {
   80         return NULL;
   81 }
   82 
   83 void __weak hw_perf_disable(void)               { barrier(); }
   84 void __weak hw_perf_enable(void)                { barrier(); }
   85 
   86 void __weak perf_event_print_debug(void)        { }
   87 
   88 static DEFINE_PER_CPU(int, perf_disable_count);
   89 
   90 void perf_disable(void)
   91 {
   92         if (!__get_cpu_var(perf_disable_count)++)
   93                 hw_perf_disable();
   94 }
   95 
   96 void perf_enable(void)
   97 {
   98         if (!--__get_cpu_var(perf_disable_count))
   99                 hw_perf_enable();
  100 }
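
/*
 * Illustrative sketch (hypothetical caller, not from the original file):
 * perf_disable()/perf_enable() nest per CPU via perf_disable_count, so a
 * section that reprograms the PMU can be bracketed without caring whether
 * events were already disabled further up the call chain.
 */
static void example_bracketed_pmu_update(void)
{
        perf_disable();         /* outermost call disables the hardware     */
        perf_disable();         /* nested call only bumps the counter       */
        /* ... reprogram PMU state here ... */
        perf_enable();          /* counter drops to 1, hardware stays off   */
        perf_enable();          /* counter reaches 0, hw_perf_enable() runs */
}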
  101 
  102 static void get_ctx(struct perf_event_context *ctx)
  103 {
  104         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
  105 }
  106 
  107 static void free_ctx(struct rcu_head *head)
  108 {
  109         struct perf_event_context *ctx;
  110 
  111         ctx = container_of(head, struct perf_event_context, rcu_head);
  112         kfree(ctx);
  113 }
  114 
  115 static void put_ctx(struct perf_event_context *ctx)
  116 {
  117         if (atomic_dec_and_test(&ctx->refcount)) {
  118                 if (ctx->parent_ctx)
  119                         put_ctx(ctx->parent_ctx);
  120                 if (ctx->task)
  121                         put_task_struct(ctx->task);
  122                 call_rcu(&ctx->rcu_head, free_ctx);
  123         }
  124 }
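
/*
 * Illustrative sketch (hypothetical caller, not from the original file):
 * a reference taken with get_ctx() must be balanced by put_ctx(); the last
 * put_ctx() drops the parent and task references and frees the context
 * after an RCU grace period via call_rcu()/free_ctx().
 */
static void example_ctx_refcount(struct perf_event_context *ctx)
{
        get_ctx(ctx);           /* hold the context                      */
        /* ... use ctx under ctx->lock or rcu_read_lock() ...            */
        put_ctx(ctx);           /* last reference defers the free to RCU */
}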
  125 
  126 static void unclone_ctx(struct perf_event_context *ctx)
  127 {
  128         if (ctx->parent_ctx) {
  129                 put_ctx(ctx->parent_ctx);
  130                 ctx->parent_ctx = NULL;
  131         }
  132 }
  133 
  134 /*
  135  * If we inherit events we want to return the parent event id
  136  * to userspace.
  137  */
  138 static u64 primary_event_id(struct perf_event *event)
  139 {
  140         u64 id = event->id;
  141 
  142         if (event->parent)
  143                 id = event->parent->id;
  144 
  145         return id;
  146 }
  147 
  148 /*
  149  * Get the perf_event_context for a task and lock it.
   150  * This has to cope with the fact that until it is locked,
  151  * the context could get moved to another task.
  152  */
  153 static struct perf_event_context *
  154 perf_lock_task_context(struct task_struct *task, unsigned long *flags)
  155 {
  156         struct perf_event_context *ctx;
  157 
  158         rcu_read_lock();
  159  retry:
  160         ctx = rcu_dereference(task->perf_event_ctxp);
  161         if (ctx) {
  162                 /*
  163                  * If this context is a clone of another, it might
  164                  * get swapped for another underneath us by
  165                  * perf_event_task_sched_out, though the
  166                  * rcu_read_lock() protects us from any context
  167                  * getting freed.  Lock the context and check if it
  168                  * got swapped before we could get the lock, and retry
  169                  * if so.  If we locked the right context, then it
  170                  * can't get swapped on us any more.
  171                  */
  172                 raw_spin_lock_irqsave(&ctx->lock, *flags);
  173                 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
  174                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
  175                         goto retry;
  176                 }
  177 
  178                 if (!atomic_inc_not_zero(&ctx->refcount)) {
  179                         raw_spin_unlock_irqrestore(&ctx->lock, *flags);
  180                         ctx = NULL;
  181                 }
  182         }
  183         rcu_read_unlock();
  184         return ctx;
  185 }
  186 
  187 /*
  188  * Get the context for a task and increment its pin_count so it
  189  * can't get swapped to another task.  This also increments its
  190  * reference count so that the context can't get freed.
  191  */
  192 static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
  193 {
  194         struct perf_event_context *ctx;
  195         unsigned long flags;
  196 
  197         ctx = perf_lock_task_context(task, &flags);
  198         if (ctx) {
  199                 ++ctx->pin_count;
  200                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
  201         }
  202         return ctx;
  203 }
  204 
  205 static void perf_unpin_context(struct perf_event_context *ctx)
  206 {
  207         unsigned long flags;
  208 
  209         raw_spin_lock_irqsave(&ctx->lock, flags);
  210         --ctx->pin_count;
  211         raw_spin_unlock_irqrestore(&ctx->lock, flags);
  212         put_ctx(ctx);
  213 }
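
/*
 * Illustrative sketch (hypothetical caller, not from the original file):
 * perf_pin_task_context() and perf_unpin_context() pair up; while pinned,
 * the context can neither be swapped to another task nor freed.
 */
static void example_pin_task_context(struct task_struct *task)
{
        struct perf_event_context *ctx;

        ctx = perf_pin_task_context(task);
        if (!ctx)
                return;                 /* task has no perf context     */
        /* ... walk or modify ctx, taking ctx->lock where needed ...    */
        perf_unpin_context(ctx);        /* drops pin_count and refcount */
}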
  214 
  215 static inline u64 perf_clock(void)
  216 {
  217         return local_clock();
  218 }
  219 
  220 /*
  221  * Update the record of the current time in a context.
  222  */
  223 static void update_context_time(struct perf_event_context *ctx)
  224 {
  225         u64 now = perf_clock();
  226 
  227         ctx->time += now - ctx->timestamp;
  228         ctx->timestamp = now;
  229 }
  230 
  231 /*
   232  * Update the total_time_enabled and total_time_running fields for an event.
  233  */
  234 static void update_event_times(struct perf_event *event)
  235 {
  236         struct perf_event_context *ctx = event->ctx;
  237         u64 run_end;
  238 
  239         if (event->state < PERF_EVENT_STATE_INACTIVE ||
  240             event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
  241                 return;
  242 
  243         if (ctx->is_active)
  244                 run_end = ctx->time;
  245         else
  246                 run_end = event->tstamp_stopped;
  247 
  248         event->total_time_enabled = run_end - event->tstamp_enabled;
  249 
  250         if (event->state == PERF_EVENT_STATE_INACTIVE)
  251                 run_end = event->tstamp_stopped;
  252         else
  253                 run_end = ctx->time;
  254 
  255         event->total_time_running = run_end - event->tstamp_running;
  256 }
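
/*
 * Illustrative sketch (not from the original file): total_time_enabled and
 * total_time_running are what readers use to extrapolate a count when the
 * event was multiplexed off the PMU for part of its lifetime, e.g. a count
 * of 1000 with enabled = 4ms but running = 2ms scales up to about 2000.
 */
static u64 example_scale_count(u64 count, u64 enabled, u64 running)
{
        if (!running)
                return 0;       /* event never ran; nothing to scale */
        return div64_u64(count * enabled, running);
}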
  257 
  258 /*
  259  * Update total_time_enabled and total_time_running for all events in a group.
  260  */
  261 static void update_group_times(struct perf_event *leader)
  262 {
  263         struct perf_event *event;
  264 
  265         update_event_times(leader);
  266         list_for_each_entry(event, &leader->sibling_list, group_entry)
  267                 update_event_times(event);
  268 }
  269 
  270 static struct list_head *
  271 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
  272 {
  273         if (event->attr.pinned)
  274                 return &ctx->pinned_groups;
  275         else
  276                 return &ctx->flexible_groups;
  277 }
  278 
  279 /*
   280  * Add an event to the lists for its context.
  281  * Must be called with ctx->mutex and ctx->lock held.
  282  */
  283 static void
  284 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
  285 {
  286         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
  287         event->attach_state |= PERF_ATTACH_CONTEXT;
  288 
  289         /*
   290          * If we're a stand-alone event or group leader, we go onto the context
   291          * list; group events are kept attached to the group so that
  292          * perf_group_detach can, at all times, locate all siblings.
  293          */
  294         if (event->group_leader == event) {
  295                 struct list_head *list;
  296 
  297                 if (is_software_event(event))
  298                         event->group_flags |= PERF_GROUP_SOFTWARE;
  299 
  300                 list = ctx_group_list(event, ctx);
  301                 list_add_tail(&event->group_entry, list);
  302         }
  303 
  304         list_add_rcu(&event->event_entry, &ctx->event_list);
  305         ctx->nr_events++;
  306         if (event->attr.inherit_stat)
  307                 ctx->nr_stat++;
  308 }
  309 
  310 static void perf_group_attach(struct perf_event *event)
  311 {
  312         struct perf_event *group_leader = event->group_leader;
  313 
  314         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
  315         event->attach_state |= PERF_ATTACH_GROUP;
  316 
  317         if (group_leader == event)
  318                 return;
  319 
  320         if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
  321                         !is_software_event(event))
  322                 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
  323 
  324         list_add_tail(&event->group_entry, &group_leader->sibling_list);
  325         group_leader->nr_siblings++;
  326 }
  327 
  328 /*
   329  * Remove an event from the lists for its context.
  330  * Must be called with ctx->mutex and ctx->lock held.
  331  */
  332 static void
  333 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
  334 {
  335         /*
  336          * We can have double detach due to exit/hot-unplug + close.
  337          */
  338         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
  339                 return;
  340 
  341         event->attach_state &= ~PERF_ATTACH_CONTEXT;
  342 
  343         ctx->nr_events--;
  344         if (event->attr.inherit_stat)
  345                 ctx->nr_stat--;
  346 
  347         list_del_rcu(&event->event_entry);
  348 
  349         if (event->group_leader == event)
  350                 list_del_init(&event->group_entry);
  351 
  352         update_group_times(event);
  353 
  354         /*
   355          * If the event was in error state, then keep it
   356          * that way; otherwise bogus counts will be
   357          * returned on read(). The only way to get out
   358          * of error state is by explicitly re-enabling
   359          * the event.
  360          */
  361         if (event->state > PERF_EVENT_STATE_OFF)
  362                 event->state = PERF_EVENT_STATE_OFF;
  363 }
  364 
  365 static void perf_group_detach(struct perf_event *event)
  366 {
  367         struct perf_event *sibling, *tmp;
  368         struct list_head *list = NULL;
  369 
  370         /*
  371          * We can have double detach due to exit/hot-unplug + close.
  372          */
  373         if (!(event->attach_state & PERF_ATTACH_GROUP))
  374                 return;
  375 
  376         event->attach_state &= ~PERF_ATTACH_GROUP;
  377 
  378         /*
  379          * If this is a sibling, remove it from its group.
  380          */
  381         if (event->group_leader != event) {
  382                 list_del_init(&event->group_entry);
  383                 event->group_leader->nr_siblings--;
  384                 return;
  385         }
  386 
  387         if (!list_empty(&event->group_entry))
  388                 list = &event->group_entry;
  389 
  390         /*
  391          * If this was a group event with sibling events then
  392          * upgrade the siblings to singleton events by adding them
  393          * to whatever list we are on.
  394          */
  395         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
  396                 if (list)
  397                         list_move_tail(&sibling->group_entry, list);
  398                 sibling->group_leader = sibling;
  399 
  400                 /* Inherit group flags from the previous leader */
  401                 sibling->group_flags = event->group_flags;
  402         }
  403 }
  404 
  405 static void
  406 event_sched_out(struct perf_event *event,
  407                   struct perf_cpu_context *cpuctx,
  408                   struct perf_event_context *ctx)
  409 {
  410         if (event->state != PERF_EVENT_STATE_ACTIVE)
  411                 return;
  412 
  413         event->state = PERF_EVENT_STATE_INACTIVE;
  414         if (event->pending_disable) {
  415                 event->pending_disable = 0;
  416                 event->state = PERF_EVENT_STATE_OFF;
  417         }
  418         event->tstamp_stopped = ctx->time;
  419         event->pmu->disable(event);
  420         event->oncpu = -1;
  421 
  422         if (!is_software_event(event))
  423                 cpuctx->active_oncpu--;
  424         ctx->nr_active--;
  425         if (event->attr.exclusive || !cpuctx->active_oncpu)
  426                 cpuctx->exclusive = 0;
  427 }
  428 
  429 static void
  430 group_sched_out(struct perf_event *group_event,
  431                 struct perf_cpu_context *cpuctx,
  432                 struct perf_event_context *ctx)
  433 {
  434         struct perf_event *event;
  435 
  436         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
  437                 return;
  438 
  439         event_sched_out(group_event, cpuctx, ctx);
  440 
  441         /*
  442          * Schedule out siblings (if any):
  443          */
  444         list_for_each_entry(event, &group_event->sibling_list, group_entry)
  445                 event_sched_out(event, cpuctx, ctx);
  446 
  447         if (group_event->attr.exclusive)
  448                 cpuctx->exclusive = 0;
  449 }
  450 
  451 /*
  452  * Cross CPU call to remove a performance event
  453  *
  454  * We disable the event on the hardware level first. After that we
  455  * remove it from the context list.
  456  */
  457 static void __perf_event_remove_from_context(void *info)
  458 {
  459         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  460         struct perf_event *event = info;
  461         struct perf_event_context *ctx = event->ctx;
  462 
  463         /*
  464          * If this is a task context, we need to check whether it is
  465          * the current task context of this cpu. If not it has been
  466          * scheduled out before the smp call arrived.
  467          */
  468         if (ctx->task && cpuctx->task_ctx != ctx)
  469                 return;
  470 
  471         raw_spin_lock(&ctx->lock);
  472         /*
  473          * Protect the list operation against NMI by disabling the
  474          * events on a global level.
  475          */
  476         perf_disable();
  477 
  478         event_sched_out(event, cpuctx, ctx);
  479 
  480         list_del_event(event, ctx);
  481 
  482         if (!ctx->task) {
  483                 /*
  484                  * Allow more per task events with respect to the
  485                  * reservation:
  486                  */
  487                 cpuctx->max_pertask =
  488                         min(perf_max_events - ctx->nr_events,
  489                             perf_max_events - perf_reserved_percpu);
  490         }
  491 
  492         perf_enable();
  493         raw_spin_unlock(&ctx->lock);
  494 }
  495 
  496 
  497 /*
  498  * Remove the event from a task's (or a CPU's) list of events.
  499  *
  500  * Must be called with ctx->mutex held.
  501  *
  502  * CPU events are removed with a smp call. For task events we only
  503  * call when the task is on a CPU.
  504  *
  505  * If event->ctx is a cloned context, callers must make sure that
  506  * every task struct that event->ctx->task could possibly point to
  507  * remains valid.  This is OK when called from perf_release since
  508  * that only calls us on the top-level context, which can't be a clone.
  509  * When called from perf_event_exit_task, it's OK because the
  510  * context has been detached from its task.
  511  */
  512 static void perf_event_remove_from_context(struct perf_event *event)
  513 {
  514         struct perf_event_context *ctx = event->ctx;
  515         struct task_struct *task = ctx->task;
  516 
  517         if (!task) {
  518                 /*
  519                  * Per cpu events are removed via an smp call and
  520                  * the removal is always successful.
  521                  */
  522                 smp_call_function_single(event->cpu,
  523                                          __perf_event_remove_from_context,
  524                                          event, 1);
  525                 return;
  526         }
  527 
  528 retry:
  529         task_oncpu_function_call(task, __perf_event_remove_from_context,
  530                                  event);
  531 
  532         raw_spin_lock_irq(&ctx->lock);
  533         /*
  534          * If the context is active we need to retry the smp call.
  535          */
  536         if (ctx->nr_active && !list_empty(&event->group_entry)) {
  537                 raw_spin_unlock_irq(&ctx->lock);
  538                 goto retry;
  539         }
  540 
  541         /*
   542          * The lock prevents this context from being scheduled in, so we
   543          * can remove the event safely if the call above did not
   544          * succeed.
  545          */
  546         if (!list_empty(&event->group_entry))
  547                 list_del_event(event, ctx);
  548         raw_spin_unlock_irq(&ctx->lock);
  549 }
  550 
  551 /*
  552  * Cross CPU call to disable a performance event
  553  */
  554 static void __perf_event_disable(void *info)
  555 {
  556         struct perf_event *event = info;
  557         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  558         struct perf_event_context *ctx = event->ctx;
  559 
  560         /*
   561          * If this is a per-task event, we need to check whether this
  562          * event's task is the current task on this cpu.
  563          */
  564         if (ctx->task && cpuctx->task_ctx != ctx)
  565                 return;
  566 
  567         raw_spin_lock(&ctx->lock);
  568 
  569         /*
  570          * If the event is on, turn it off.
  571          * If it is in error state, leave it in error state.
  572          */
  573         if (event->state >= PERF_EVENT_STATE_INACTIVE) {
  574                 update_context_time(ctx);
  575                 update_group_times(event);
  576                 if (event == event->group_leader)
  577                         group_sched_out(event, cpuctx, ctx);
  578                 else
  579                         event_sched_out(event, cpuctx, ctx);
  580                 event->state = PERF_EVENT_STATE_OFF;
  581         }
  582 
  583         raw_spin_unlock(&ctx->lock);
  584 }
  585 
  586 /*
   587  * Disable an event.
  588  *
  589  * If event->ctx is a cloned context, callers must make sure that
  590  * every task struct that event->ctx->task could possibly point to
   591  * remains valid.  This condition is satisfied when called through
  592  * perf_event_for_each_child or perf_event_for_each because they
  593  * hold the top-level event's child_mutex, so any descendant that
  594  * goes to exit will block in sync_child_event.
  595  * When called from perf_pending_event it's OK because event->ctx
  596  * is the current context on this CPU and preemption is disabled,
  597  * hence we can't get into perf_event_task_sched_out for this context.
  598  */
  599 void perf_event_disable(struct perf_event *event)
  600 {
  601         struct perf_event_context *ctx = event->ctx;
  602         struct task_struct *task = ctx->task;
  603 
  604         if (!task) {
  605                 /*
  606                  * Disable the event on the cpu that it's on
  607                  */
  608                 smp_call_function_single(event->cpu, __perf_event_disable,
  609                                          event, 1);
  610                 return;
  611         }
  612 
  613  retry:
  614         task_oncpu_function_call(task, __perf_event_disable, event);
  615 
  616         raw_spin_lock_irq(&ctx->lock);
  617         /*
  618          * If the event is still active, we need to retry the cross-call.
  619          */
  620         if (event->state == PERF_EVENT_STATE_ACTIVE) {
  621                 raw_spin_unlock_irq(&ctx->lock);
  622                 goto retry;
  623         }
  624 
  625         /*
  626          * Since we have the lock this context can't be scheduled
  627          * in, so we can change the state safely.
  628          */
  629         if (event->state == PERF_EVENT_STATE_INACTIVE) {
  630                 update_group_times(event);
  631                 event->state = PERF_EVENT_STATE_OFF;
  632         }
  633 
  634         raw_spin_unlock_irq(&ctx->lock);
  635 }
  636 
  637 static int
  638 event_sched_in(struct perf_event *event,
  639                  struct perf_cpu_context *cpuctx,
  640                  struct perf_event_context *ctx)
  641 {
  642         if (event->state <= PERF_EVENT_STATE_OFF)
  643                 return 0;
  644 
  645         event->state = PERF_EVENT_STATE_ACTIVE;
  646         event->oncpu = smp_processor_id();
  647         /*
  648          * The new state must be visible before we turn it on in the hardware:
  649          */
  650         smp_wmb();
  651 
  652         if (event->pmu->enable(event)) {
  653                 event->state = PERF_EVENT_STATE_INACTIVE;
  654                 event->oncpu = -1;
  655                 return -EAGAIN;
  656         }
  657 
  658         event->tstamp_running += ctx->time - event->tstamp_stopped;
  659 
  660         if (!is_software_event(event))
  661                 cpuctx->active_oncpu++;
  662         ctx->nr_active++;
  663 
  664         if (event->attr.exclusive)
  665                 cpuctx->exclusive = 1;
  666 
  667         return 0;
  668 }
  669 
  670 static int
  671 group_sched_in(struct perf_event *group_event,
  672                struct perf_cpu_context *cpuctx,
  673                struct perf_event_context *ctx)
  674 {
  675         struct perf_event *event, *partial_group = NULL;
  676         const struct pmu *pmu = group_event->pmu;
  677         bool txn = false;
  678 
  679         if (group_event->state == PERF_EVENT_STATE_OFF)
  680                 return 0;
  681 
   682         /* Check whether a group transaction is available */
  683         if (pmu->start_txn)
  684                 txn = true;
  685 
  686         if (txn)
  687                 pmu->start_txn(pmu);
  688 
  689         if (event_sched_in(group_event, cpuctx, ctx)) {
  690                 if (txn)
  691                         pmu->cancel_txn(pmu);
  692                 return -EAGAIN;
  693         }
  694 
  695         /*
  696          * Schedule in siblings as one group (if any):
  697          */
  698         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
  699                 if (event_sched_in(event, cpuctx, ctx)) {
  700                         partial_group = event;
  701                         goto group_error;
  702                 }
  703         }
  704 
  705         if (!txn || !pmu->commit_txn(pmu))
  706                 return 0;
  707 
  708 group_error:
  709         /*
  710          * Groups can be scheduled in as one unit only, so undo any
  711          * partial group before returning:
  712          */
  713         list_for_each_entry(event, &group_event->sibling_list, group_entry) {
  714                 if (event == partial_group)
  715                         break;
  716                 event_sched_out(event, cpuctx, ctx);
  717         }
  718         event_sched_out(group_event, cpuctx, ctx);
  719 
  720         if (txn)
  721                 pmu->cancel_txn(pmu);
  722 
  723         return -EAGAIN;
  724 }
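
/*
 * Illustrative sketch (not from the original file): the optional
 * transaction callbacks used above let a PMU validate a whole group in one
 * go.  The sketch assumes a PMU that implements all three hooks; the
 * calling convention mirrors group_sched_in(): open the transaction,
 * schedule the leader and siblings, then commit or cancel.
 */
static int example_group_txn(const struct pmu *pmu)
{
        pmu->start_txn(pmu);            /* open the batch                 */
        /* ... pmu->enable() the group leader and each sibling ...        */
        if (pmu->commit_txn(pmu)) {     /* nonzero: hardware rejected it  */
                pmu->cancel_txn(pmu);   /* roll the partial group back    */
                return -EAGAIN;
        }
        return 0;                       /* whole group is now scheduled   */
}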
  725 
  726 /*
  727  * Work out whether we can put this event group on the CPU now.
  728  */
  729 static int group_can_go_on(struct perf_event *event,
  730                            struct perf_cpu_context *cpuctx,
  731                            int can_add_hw)
  732 {
  733         /*
  734          * Groups consisting entirely of software events can always go on.
  735          */
  736         if (event->group_flags & PERF_GROUP_SOFTWARE)
  737                 return 1;
  738         /*
  739          * If an exclusive group is already on, no other hardware
  740          * events can go on.
  741          */
  742         if (cpuctx->exclusive)
  743                 return 0;
  744         /*
  745          * If this group is exclusive and there are already
  746          * events on the CPU, it can't go on.
  747          */
  748         if (event->attr.exclusive && cpuctx->active_oncpu)
  749                 return 0;
  750         /*
  751          * Otherwise, try to add it if all previous groups were able
  752          * to go on.
  753          */
  754         return can_add_hw;
  755 }
  756 
  757 static void add_event_to_ctx(struct perf_event *event,
  758                                struct perf_event_context *ctx)
  759 {
  760         list_add_event(event, ctx);
  761         perf_group_attach(event);
  762         event->tstamp_enabled = ctx->time;
  763         event->tstamp_running = ctx->time;
  764         event->tstamp_stopped = ctx->time;
  765 }
  766 
  767 /*
  768  * Cross CPU call to install and enable a performance event
  769  *
  770  * Must be called with ctx->mutex held
  771  */
  772 static void __perf_install_in_context(void *info)
  773 {
  774         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  775         struct perf_event *event = info;
  776         struct perf_event_context *ctx = event->ctx;
  777         struct perf_event *leader = event->group_leader;
  778         int err;
  779 
  780         /*
  781          * If this is a task context, we need to check whether it is
  782          * the current task context of this cpu. If not it has been
  783          * scheduled out before the smp call arrived.
  784          * Or possibly this is the right context but it isn't
  785          * on this cpu because it had no events.
  786          */
  787         if (ctx->task && cpuctx->task_ctx != ctx) {
  788                 if (cpuctx->task_ctx || ctx->task != current)
  789                         return;
  790                 cpuctx->task_ctx = ctx;
  791         }
  792 
  793         raw_spin_lock(&ctx->lock);
  794         ctx->is_active = 1;
  795         update_context_time(ctx);
  796 
  797         /*
  798          * Protect the list operation against NMI by disabling the
  799          * events on a global level. NOP for non NMI based events.
  800          */
  801         perf_disable();
  802 
  803         add_event_to_ctx(event, ctx);
  804 
  805         if (event->cpu != -1 && event->cpu != smp_processor_id())
  806                 goto unlock;
  807 
  808         /*
  809          * Don't put the event on if it is disabled or if
  810          * it is in a group and the group isn't on.
  811          */
  812         if (event->state != PERF_EVENT_STATE_INACTIVE ||
  813             (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
  814                 goto unlock;
  815 
  816         /*
  817          * An exclusive event can't go on if there are already active
  818          * hardware events, and no hardware event can go on if there
  819          * is already an exclusive event on.
  820          */
  821         if (!group_can_go_on(event, cpuctx, 1))
  822                 err = -EEXIST;
  823         else
  824                 err = event_sched_in(event, cpuctx, ctx);
  825 
  826         if (err) {
  827                 /*
  828                  * This event couldn't go on.  If it is in a group
  829                  * then we have to pull the whole group off.
  830                  * If the event group is pinned then put it in error state.
  831                  */
  832                 if (leader != event)
  833                         group_sched_out(leader, cpuctx, ctx);
  834                 if (leader->attr.pinned) {
  835                         update_group_times(leader);
  836                         leader->state = PERF_EVENT_STATE_ERROR;
  837                 }
  838         }
  839 
  840         if (!err && !ctx->task && cpuctx->max_pertask)
  841                 cpuctx->max_pertask--;
  842 
  843  unlock:
  844         perf_enable();
  845 
  846         raw_spin_unlock(&ctx->lock);
  847 }
  848 
  849 /*
  850  * Attach a performance event to a context
  851  *
  852  * First we add the event to the list with the hardware enable bit
  853  * in event->hw_config cleared.
  854  *
   855  * If the event is attached to a task which is on a CPU we use an smp
  856  * call to enable it in the task context. The task might have been
  857  * scheduled away, but we check this in the smp call again.
  858  *
  859  * Must be called with ctx->mutex held.
  860  */
  861 static void
  862 perf_install_in_context(struct perf_event_context *ctx,
  863                         struct perf_event *event,
  864                         int cpu)
  865 {
  866         struct task_struct *task = ctx->task;
  867 
  868         if (!task) {
  869                 /*
  870                  * Per cpu events are installed via an smp call and
  871                  * the install is always successful.
  872                  */
  873                 smp_call_function_single(cpu, __perf_install_in_context,
  874                                          event, 1);
  875                 return;
  876         }
  877 
  878 retry:
  879         task_oncpu_function_call(task, __perf_install_in_context,
  880                                  event);
  881 
  882         raw_spin_lock_irq(&ctx->lock);
  883         /*
   884          * If the context is active we need to retry the smp call.
  885          */
  886         if (ctx->is_active && list_empty(&event->group_entry)) {
  887                 raw_spin_unlock_irq(&ctx->lock);
  888                 goto retry;
  889         }
  890 
  891         /*
   892          * The lock prevents this context from being scheduled in, so we
   893          * can add the event safely if the call above did not
  894          * succeed.
  895          */
  896         if (list_empty(&event->group_entry))
  897                 add_event_to_ctx(event, ctx);
  898         raw_spin_unlock_irq(&ctx->lock);
  899 }
  900 
  901 /*
   902  * Put an event into inactive state and update time fields.
  903  * Enabling the leader of a group effectively enables all
  904  * the group members that aren't explicitly disabled, so we
  905  * have to update their ->tstamp_enabled also.
  906  * Note: this works for group members as well as group leaders
  907  * since the non-leader members' sibling_lists will be empty.
  908  */
  909 static void __perf_event_mark_enabled(struct perf_event *event,
  910                                         struct perf_event_context *ctx)
  911 {
  912         struct perf_event *sub;
  913 
  914         event->state = PERF_EVENT_STATE_INACTIVE;
  915         event->tstamp_enabled = ctx->time - event->total_time_enabled;
  916         list_for_each_entry(sub, &event->sibling_list, group_entry)
  917                 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
  918                         sub->tstamp_enabled =
  919                                 ctx->time - sub->total_time_enabled;
  920 }
  921 
  922 /*
  923  * Cross CPU call to enable a performance event
  924  */
  925 static void __perf_event_enable(void *info)
  926 {
  927         struct perf_event *event = info;
  928         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
  929         struct perf_event_context *ctx = event->ctx;
  930         struct perf_event *leader = event->group_leader;
  931         int err;
  932 
  933         /*
   934          * If this is a per-task event, we need to check whether this
  935          * event's task is the current task on this cpu.
  936          */
  937         if (ctx->task && cpuctx->task_ctx != ctx) {
  938                 if (cpuctx->task_ctx || ctx->task != current)
  939                         return;
  940                 cpuctx->task_ctx = ctx;
  941         }
  942 
  943         raw_spin_lock(&ctx->lock);
  944         ctx->is_active = 1;
  945         update_context_time(ctx);
  946 
  947         if (event->state >= PERF_EVENT_STATE_INACTIVE)
  948                 goto unlock;
  949         __perf_event_mark_enabled(event, ctx);
  950 
  951         if (event->cpu != -1 && event->cpu != smp_processor_id())
  952                 goto unlock;
  953 
  954         /*
  955          * If the event is in a group and isn't the group leader,
  956          * then don't put it on unless the group is on.
  957          */
  958         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
  959                 goto unlock;
  960 
  961         if (!group_can_go_on(event, cpuctx, 1)) {
  962                 err = -EEXIST;
  963         } else {
  964                 perf_disable();
  965                 if (event == leader)
  966                         err = group_sched_in(event, cpuctx, ctx);
  967                 else
  968                         err = event_sched_in(event, cpuctx, ctx);
  969                 perf_enable();
  970         }
  971 
  972         if (err) {
  973                 /*
  974                  * If this event can't go on and it's part of a
  975                  * group, then the whole group has to come off.
  976                  */
  977                 if (leader != event)
  978                         group_sched_out(leader, cpuctx, ctx);
  979                 if (leader->attr.pinned) {
  980                         update_group_times(leader);
  981                         leader->state = PERF_EVENT_STATE_ERROR;
  982                 }
  983         }
  984 
  985  unlock:
  986         raw_spin_unlock(&ctx->lock);
  987 }
  988 
  989 /*
   990  * Enable an event.
  991  *
  992  * If event->ctx is a cloned context, callers must make sure that
  993  * every task struct that event->ctx->task could possibly point to
  994  * remains valid.  This condition is satisfied when called through
  995  * perf_event_for_each_child or perf_event_for_each as described
  996  * for perf_event_disable.
  997  */
  998 void perf_event_enable(struct perf_event *event)
  999 {
 1000         struct perf_event_context *ctx = event->ctx;
 1001         struct task_struct *task = ctx->task;
 1002 
 1003         if (!task) {
 1004                 /*
 1005                  * Enable the event on the cpu that it's on
 1006                  */
 1007                 smp_call_function_single(event->cpu, __perf_event_enable,
 1008                                          event, 1);
 1009                 return;
 1010         }
 1011 
 1012         raw_spin_lock_irq(&ctx->lock);
 1013         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 1014                 goto out;
 1015 
 1016         /*
 1017          * If the event is in error state, clear that first.
 1018          * That way, if we see the event in error state below, we
 1019          * know that it has gone back into error state, as distinct
 1020          * from the task having been scheduled away before the
 1021          * cross-call arrived.
 1022          */
 1023         if (event->state == PERF_EVENT_STATE_ERROR)
 1024                 event->state = PERF_EVENT_STATE_OFF;
 1025 
 1026  retry:
 1027         raw_spin_unlock_irq(&ctx->lock);
 1028         task_oncpu_function_call(task, __perf_event_enable, event);
 1029 
 1030         raw_spin_lock_irq(&ctx->lock);
 1031 
 1032         /*
 1033          * If the context is active and the event is still off,
 1034          * we need to retry the cross-call.
 1035          */
 1036         if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
 1037                 goto retry;
 1038 
 1039         /*
 1040          * Since we have the lock this context can't be scheduled
 1041          * in, so we can change the state safely.
 1042          */
 1043         if (event->state == PERF_EVENT_STATE_OFF)
 1044                 __perf_event_mark_enabled(event, ctx);
 1045 
 1046  out:
 1047         raw_spin_unlock_irq(&ctx->lock);
 1048 }
 1049 
 1050 static int perf_event_refresh(struct perf_event *event, int refresh)
 1051 {
 1052         /*
 1053          * not supported on inherited events
 1054          */
 1055         if (event->attr.inherit)
 1056                 return -EINVAL;
 1057 
 1058         atomic_add(refresh, &event->event_limit);
 1059         perf_event_enable(event);
 1060 
 1061         return 0;
 1062 }
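
/*
 * Illustrative sketch (userspace side, not from the original file):
 * perf_event_refresh() is normally driven by the PERF_EVENT_IOC_REFRESH
 * ioctl on a perf event file descriptor, arming the event for 'n' more
 * overflows before it is disabled again:
 *
 *      #include <sys/ioctl.h>
 *      #include <linux/perf_event.h>
 *
 *      int example_refresh(int perf_fd, int n)
 *      {
 *              return ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, n);
 *      }
 */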
 1063 
 1064 enum event_type_t {
 1065         EVENT_FLEXIBLE = 0x1,
 1066         EVENT_PINNED = 0x2,
 1067         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 1068 };
 1069 
 1070 static void ctx_sched_out(struct perf_event_context *ctx,
 1071                           struct perf_cpu_context *cpuctx,
 1072                           enum event_type_t event_type)
 1073 {
 1074         struct perf_event *event;
 1075 
 1076         raw_spin_lock(&ctx->lock);
 1077         ctx->is_active = 0;
 1078         if (likely(!ctx->nr_events))
 1079                 goto out;
 1080         update_context_time(ctx);
 1081 
 1082         perf_disable();
 1083         if (!ctx->nr_active)
 1084                 goto out_enable;
 1085 
 1086         if (event_type & EVENT_PINNED)
 1087                 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 1088                         group_sched_out(event, cpuctx, ctx);
 1089 
 1090         if (event_type & EVENT_FLEXIBLE)
 1091                 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 1092                         group_sched_out(event, cpuctx, ctx);
 1093 
 1094  out_enable:
 1095         perf_enable();
 1096  out:
 1097         raw_spin_unlock(&ctx->lock);
 1098 }
 1099 
 1100 /*
 1101  * Test whether two contexts are equivalent, i.e. whether they
 1102  * have both been cloned from the same version of the same context
 1103  * and they both have the same number of enabled events.
 1104  * If the number of enabled events is the same, then the set
 1105  * of enabled events should be the same, because these are both
 1106  * inherited contexts, therefore we can't access individual events
 1107  * in them directly with an fd; we can only enable/disable all
 1108  * events via prctl, or enable/disable all events in a family
 1109  * via ioctl, which will have the same effect on both contexts.
 1110  */
 1111 static int context_equiv(struct perf_event_context *ctx1,
 1112                          struct perf_event_context *ctx2)
 1113 {
 1114         return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
 1115                 && ctx1->parent_gen == ctx2->parent_gen
 1116                 && !ctx1->pin_count && !ctx2->pin_count;
 1117 }
 1118 
 1119 static void __perf_event_sync_stat(struct perf_event *event,
 1120                                      struct perf_event *next_event)
 1121 {
 1122         u64 value;
 1123 
 1124         if (!event->attr.inherit_stat)
 1125                 return;
 1126 
 1127         /*
  1128          * Update the event value; we cannot use perf_event_read()
 1129          * because we're in the middle of a context switch and have IRQs
 1130          * disabled, which upsets smp_call_function_single(), however
 1131          * we know the event must be on the current CPU, therefore we
 1132          * don't need to use it.
 1133          */
 1134         switch (event->state) {
 1135         case PERF_EVENT_STATE_ACTIVE:
 1136                 event->pmu->read(event);
 1137                 /* fall-through */
 1138 
 1139         case PERF_EVENT_STATE_INACTIVE:
 1140                 update_event_times(event);
 1141                 break;
 1142 
 1143         default:
 1144                 break;
 1145         }
 1146 
 1147         /*
 1148          * In order to keep per-task stats reliable we need to flip the event
 1149          * values when we flip the contexts.
 1150          */
 1151         value = local64_read(&next_event->count);
 1152         value = local64_xchg(&event->count, value);
 1153         local64_set(&next_event->count, value);
 1154 
 1155         swap(event->total_time_enabled, next_event->total_time_enabled);
 1156         swap(event->total_time_running, next_event->total_time_running);
 1157 
 1158         /*
 1159          * Since we swizzled the values, update the user visible data too.
 1160          */
 1161         perf_event_update_userpage(event);
 1162         perf_event_update_userpage(next_event);
 1163 }
 1164 
 1165 #define list_next_entry(pos, member) \
 1166         list_entry(pos->member.next, typeof(*pos), member)
 1167 
 1168 static void perf_event_sync_stat(struct perf_event_context *ctx,
 1169                                    struct perf_event_context *next_ctx)
 1170 {
 1171         struct perf_event *event, *next_event;
 1172 
 1173         if (!ctx->nr_stat)
 1174                 return;
 1175 
 1176         update_context_time(ctx);
 1177 
 1178         event = list_first_entry(&ctx->event_list,
 1179                                    struct perf_event, event_entry);
 1180 
 1181         next_event = list_first_entry(&next_ctx->event_list,
 1182                                         struct perf_event, event_entry);
 1183 
 1184         while (&event->event_entry != &ctx->event_list &&
 1185                &next_event->event_entry != &next_ctx->event_list) {
 1186 
 1187                 __perf_event_sync_stat(event, next_event);
 1188 
 1189                 event = list_next_entry(event, event_entry);
 1190                 next_event = list_next_entry(next_event, event_entry);
 1191         }
 1192 }
 1193 
 1194 /*
 1195  * Called from scheduler to remove the events of the current task,
 1196  * with interrupts disabled.
 1197  *
 1198  * We stop each event and update the event value in event->count.
 1199  *
 1200  * This does not protect us against NMI, but disable()
 1201  * sets the disabled bit in the control field of event _before_
  1202  * accessing the event control register. If an NMI hits, then it will
 1203  * not restart the event.
 1204  */
 1205 void perf_event_task_sched_out(struct task_struct *task,
 1206                                  struct task_struct *next)
 1207 {
 1208         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 1209         struct perf_event_context *ctx = task->perf_event_ctxp;
 1210         struct perf_event_context *next_ctx;
 1211         struct perf_event_context *parent;
 1212         int do_switch = 1;
 1213 
 1214         perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
 1215 
 1216         if (likely(!ctx || !cpuctx->task_ctx))
 1217                 return;
 1218 
 1219         rcu_read_lock();
 1220         parent = rcu_dereference(ctx->parent_ctx);
 1221         next_ctx = next->perf_event_ctxp;
 1222         if (parent && next_ctx &&
 1223             rcu_dereference(next_ctx->parent_ctx) == parent) {
 1224                 /*
 1225                  * Looks like the two contexts are clones, so we might be
 1226                  * able to optimize the context switch.  We lock both
 1227                  * contexts and check that they are clones under the
 1228                  * lock (including re-checking that neither has been
 1229                  * uncloned in the meantime).  It doesn't matter which
 1230                  * order we take the locks because no other cpu could
 1231                  * be trying to lock both of these tasks.
 1232                  */
 1233                 raw_spin_lock(&ctx->lock);
 1234                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 1235                 if (context_equiv(ctx, next_ctx)) {
 1236                         /*
 1237                          * XXX do we need a memory barrier of sorts
  1238                          * w.r.t. rcu_dereference() of perf_event_ctxp?
 1239                          */
 1240                         task->perf_event_ctxp = next_ctx;
 1241                         next->perf_event_ctxp = ctx;
 1242                         ctx->task = next;
 1243                         next_ctx->task = task;
 1244                         do_switch = 0;
 1245 
 1246                         perf_event_sync_stat(ctx, next_ctx);
 1247                 }
 1248                 raw_spin_unlock(&next_ctx->lock);
 1249                 raw_spin_unlock(&ctx->lock);
 1250         }
 1251         rcu_read_unlock();
 1252 
 1253         if (do_switch) {
 1254                 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
 1255                 cpuctx->task_ctx = NULL;
 1256         }
 1257 }
 1258 
 1259 static void task_ctx_sched_out(struct perf_event_context *ctx,
 1260                                enum event_type_t event_type)
 1261 {
 1262         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 1263 
 1264         if (!cpuctx->task_ctx)
 1265                 return;
 1266 
 1267         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 1268                 return;
 1269 
 1270         ctx_sched_out(ctx, cpuctx, event_type);
 1271         cpuctx->task_ctx = NULL;
 1272 }
 1273 
 1274 /*
 1275  * Called with IRQs disabled
 1276  */
 1277 static void __perf_event_task_sched_out(struct perf_event_context *ctx)
 1278 {
 1279         task_ctx_sched_out(ctx, EVENT_ALL);
 1280 }
 1281 
 1282 /*
 1283  * Called with IRQs disabled
 1284  */
 1285 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 1286                               enum event_type_t event_type)
 1287 {
 1288         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 1289 }
 1290 
 1291 static void
 1292 ctx_pinned_sched_in(struct perf_event_context *ctx,
 1293                     struct perf_cpu_context *cpuctx)
 1294 {
 1295         struct perf_event *event;
 1296 
 1297         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 1298                 if (event->state <= PERF_EVENT_STATE_OFF)
 1299                         continue;
 1300                 if (event->cpu != -1 && event->cpu != smp_processor_id())
 1301                         continue;
 1302 
 1303                 if (group_can_go_on(event, cpuctx, 1))
 1304                         group_sched_in(event, cpuctx, ctx);
 1305 
 1306                 /*
 1307                  * If this pinned group hasn't been scheduled,
 1308                  * put it in error state.
 1309                  */
 1310                 if (event->state == PERF_EVENT_STATE_INACTIVE) {
 1311                         update_group_times(event);
 1312                         event->state = PERF_EVENT_STATE_ERROR;
 1313                 }
 1314         }
 1315 }
 1316 
 1317 static void
 1318 ctx_flexible_sched_in(struct perf_event_context *ctx,
 1319                       struct perf_cpu_context *cpuctx)
 1320 {
 1321         struct perf_event *event;
 1322         int can_add_hw = 1;
 1323 
 1324         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
 1325                 /* Ignore events in OFF or ERROR state */
 1326                 if (event->state <= PERF_EVENT_STATE_OFF)
 1327                         continue;
 1328                 /*
 1329                  * Listen to the 'cpu' scheduling filter constraint
 1330                  * of events:
 1331                  */
 1332                 if (event->cpu != -1 && event->cpu != smp_processor_id())
 1333                         continue;
 1334 
 1335                 if (group_can_go_on(event, cpuctx, can_add_hw))
 1336                         if (group_sched_in(event, cpuctx, ctx))
 1337                                 can_add_hw = 0;
 1338         }
 1339 }
 1340 
 1341 static void
 1342 ctx_sched_in(struct perf_event_context *ctx,
 1343              struct perf_cpu_context *cpuctx,
 1344              enum event_type_t event_type)
 1345 {
 1346         raw_spin_lock(&ctx->lock);
 1347         ctx->is_active = 1;
 1348         if (likely(!ctx->nr_events))
 1349                 goto out;
 1350 
 1351         ctx->timestamp = perf_clock();
 1352 
 1353         perf_disable();
 1354 
 1355         /*
 1356          * First go through the list and put on any pinned groups
 1357          * in order to give them the best chance of going on.
 1358          */
 1359         if (event_type & EVENT_PINNED)
 1360                 ctx_pinned_sched_in(ctx, cpuctx);
 1361 
 1362         /* Then walk through the lower prio flexible groups */
 1363         if (event_type & EVENT_FLEXIBLE)
 1364                 ctx_flexible_sched_in(ctx, cpuctx);
 1365 
 1366         perf_enable();
 1367  out:
 1368         raw_spin_unlock(&ctx->lock);
 1369 }
 1370 
 1371 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 1372                              enum event_type_t event_type)
 1373 {
 1374         struct perf_event_context *ctx = &cpuctx->ctx;
 1375 
 1376         ctx_sched_in(ctx, cpuctx, event_type);
 1377 }
 1378 
 1379 static void task_ctx_sched_in(struct task_struct *task,
 1380                               enum event_type_t event_type)
 1381 {
 1382         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 1383         struct perf_event_context *ctx = task->perf_event_ctxp;
 1384 
 1385         if (likely(!ctx))
 1386                 return;
 1387         if (cpuctx->task_ctx == ctx)
 1388                 return;
 1389         ctx_sched_in(ctx, cpuctx, event_type);
 1390         cpuctx->task_ctx = ctx;
 1391 }
 1392 /*
 1393  * Called from scheduler to add the events of the current task
 1394  * with interrupts disabled.
 1395  *
 1396  * We restore the event value and then enable it.
 1397  *
 1398  * This does not protect us against NMI, but enable()
 1399  * sets the enabled bit in the control field of event _before_
  1400  * accessing the event control register. If an NMI hits, then it will
 1401  * keep the event running.
 1402  */
 1403 void perf_event_task_sched_in(struct task_struct *task)
 1404 {
 1405         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 1406         struct perf_event_context *ctx = task->perf_event_ctxp;
 1407 
 1408         if (likely(!ctx))
 1409                 return;
 1410 
 1411         if (cpuctx->task_ctx == ctx)
 1412                 return;
 1413 
 1414         perf_disable();
 1415 
 1416         /*
 1417          * We want to keep the following priority order:
 1418          * cpu pinned (that don't need to move), task pinned,
 1419          * cpu flexible, task flexible.
 1420          */
 1421         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 1422 
 1423         ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
 1424         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 1425         ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
 1426 
 1427         cpuctx->task_ctx = ctx;
 1428 
 1429         perf_enable();
 1430 }
 1431 
 1432 #define MAX_INTERRUPTS (~0ULL)
 1433 
 1434 static void perf_log_throttle(struct perf_event *event, int enable);
 1435 
 1436 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
 1437 {
 1438         u64 frequency = event->attr.sample_freq;
 1439         u64 sec = NSEC_PER_SEC;
 1440         u64 divisor, dividend;
 1441 
 1442         int count_fls, nsec_fls, frequency_fls, sec_fls;
 1443 
 1444         count_fls = fls64(count);
 1445         nsec_fls = fls64(nsec);
 1446         frequency_fls = fls64(frequency);
 1447         sec_fls = 30;
 1448 
 1449         /*
  1450          * We got @count in @nsec, with a target of sample_freq HZ;
 1451          * the target period becomes:
 1452          *
 1453          *             @count * 10^9
 1454          * period = -------------------
 1455          *          @nsec * sample_freq
 1456          *
 1457          */
 1458 
 1459         /*
 1460          * Reduce accuracy by one bit such that @a and @b converge
 1461          * to a similar magnitude.
 1462          */
 1463 #define REDUCE_FLS(a, b)                \
 1464 do {                                    \
 1465         if (a##_fls > b##_fls) {        \
 1466                 a >>= 1;                \
 1467                 a##_fls--;              \
 1468         } else {                        \
 1469                 b >>= 1;                \
 1470                 b##_fls--;              \
 1471         }                               \
 1472 } while (0)
 1473 
 1474         /*
 1475          * Reduce accuracy until either term fits in a u64, then proceed with
 1476          * the other, so that finally we can do a u64/u64 division.
 1477          */
 1478         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
 1479                 REDUCE_FLS(nsec, frequency);
 1480                 REDUCE_FLS(sec, count);
 1481         }
 1482 
 1483         if (count_fls + sec_fls > 64) {
 1484                 divisor = nsec * frequency;
 1485 
 1486                 while (count_fls + sec_fls > 64) {
 1487                         REDUCE_FLS(count, sec);
 1488                         divisor >>= 1;
 1489                 }
 1490 
 1491                 dividend = count * sec;
 1492         } else {
 1493                 dividend = count * sec;
 1494 
 1495                 while (nsec_fls + frequency_fls > 64) {
 1496                         REDUCE_FLS(nsec, frequency);
 1497                         dividend >>= 1;
 1498                 }
 1499 
 1500                 divisor = nsec * frequency;
 1501         }
 1502 
 1503         if (!divisor)
 1504                 return dividend;
 1505 
 1506         return div64_u64(dividend, divisor);
 1507 }
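
/*
 * Worked example (illustrative, not from the original file): with
 * attr.sample_freq = 1000 Hz and count = 4,000,000 events observed over
 * nsec = 10,000,000 ns (one 10 ms interval):
 *
 *      period = count * 10^9 / (nsec * sample_freq)
 *             = 4e6 * 1e9 / (1e7 * 1e3)
 *             = 400,000 events per sample
 *
 * i.e. the event fires about 400 million times per second, so a period of
 * 400,000 produces roughly the requested 1000 samples per second.
 */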
 1508 
 1509 static void perf_event_stop(struct perf_event *event)
 1510 {
 1511         if (!event->pmu->stop)
 1512                 return event->pmu->disable(event);
 1513 
 1514         return event->pmu->stop(event);
 1515 }
 1516 
 1517 static int perf_event_start(struct perf_event *event)
 1518 {
 1519         if (!event->pmu->start)
 1520                 return event->pmu->enable(event);
 1521 
 1522         return event->pmu->start(event);
 1523 }
 1524 
 1525 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 1526 {
 1527         struct hw_perf_event *hwc = &event->hw;
 1528         s64 period, sample_period;
 1529         s64 delta;
 1530 
 1531         period = perf_calculate_period(event, nsec, count);
 1532 
 1533         delta = (s64)(period - hwc->sample_period);
 1534         delta = (delta + 7) / 8; /* low pass filter */
 1535 
 1536         sample_period = hwc->sample_period + delta;
 1537 
 1538         if (!sample_period)
 1539                 sample_period = 1;
 1540 
 1541         hwc->sample_period = sample_period;
 1542 
 1543         if (local64_read(&hwc->period_left) > 8*sample_period) {
 1544                 perf_disable();
 1545                 perf_event_stop(event);
 1546                 local64_set(&hwc->period_left, 0);
 1547                 perf_event_start(event);
 1548                 perf_enable();
 1549         }
 1550 }
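/*
 * Illustrative numbers for the low-pass filter above: if hwc->sample_period
 * is currently 200,000 and perf_calculate_period() returns 240,000, then
 * delta = 40,000 and the filtered step is (40,000 + 7) / 8 = 5,000, giving a
 * new sample_period of 205,000.  Only when the remaining period_left exceeds
 * 8 * sample_period is the event stopped and restarted with the new period.
 */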
 1551 
 1552 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 1553 {
 1554         struct perf_event *event;
 1555         struct hw_perf_event *hwc;
 1556         u64 interrupts, now;
 1557         s64 delta;
 1558 
 1559         raw_spin_lock(&ctx->lock);
 1560         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 1561                 if (event->state != PERF_EVENT_STATE_ACTIVE)
 1562                         continue;
 1563 
 1564                 if (event->cpu != -1 && event->cpu != smp_processor_id())
 1565                         continue;
 1566 
 1567                 hwc = &event->hw;
 1568 
 1569                 interrupts = hwc->interrupts;
 1570                 hwc->interrupts = 0;
 1571 
 1572                 /*
 1573                  * unthrottle events on the tick
 1574                  */
 1575                 if (interrupts == MAX_INTERRUPTS) {
 1576                         perf_log_throttle(event, 1);
 1577                         perf_disable();
 1578                         event->pmu->unthrottle(event);
 1579                         perf_enable();
 1580                 }
 1581 
 1582                 if (!event->attr.freq || !event->attr.sample_freq)
 1583                         continue;
 1584 
 1585                 perf_disable();
 1586                 event->pmu->read(event);
 1587                 now = local64_read(&event->count);
 1588                 delta = now - hwc->freq_count_stamp;
 1589                 hwc->freq_count_stamp = now;
 1590 
 1591                 if (delta > 0)
 1592                         perf_adjust_period(event, TICK_NSEC, delta);
 1593                 perf_enable();
 1594         }
 1595         raw_spin_unlock(&ctx->lock);
 1596 }
 1597 
 1598 /*
 1599  * Round-robin a context's events:
 1600  */
 1601 static void rotate_ctx(struct perf_event_context *ctx)
 1602 {
 1603         raw_spin_lock(&ctx->lock);
 1604 
  1605         /* Rotate the first entry of the non-pinned groups to the tail */
 1606         list_rotate_left(&ctx->flexible_groups);
 1607 
 1608         raw_spin_unlock(&ctx->lock);
 1609 }
 1610 
 1611 void perf_event_task_tick(struct task_struct *curr)
 1612 {
 1613         struct perf_cpu_context *cpuctx;
 1614         struct perf_event_context *ctx;
 1615         int rotate = 0;
 1616 
 1617         if (!atomic_read(&nr_events))
 1618                 return;
 1619 
 1620         cpuctx = &__get_cpu_var(perf_cpu_context);
 1621         if (cpuctx->ctx.nr_events &&
 1622             cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
 1623                 rotate = 1;
 1624 
 1625         ctx = curr->perf_event_ctxp;
 1626         if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
 1627                 rotate = 1;
 1628 
 1629         perf_ctx_adjust_freq(&cpuctx->ctx);
 1630         if (ctx)
 1631                 perf_ctx_adjust_freq(ctx);
 1632 
 1633         if (!rotate)
 1634                 return;
 1635 
 1636         perf_disable();
 1637         cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 1638         if (ctx)
 1639                 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
 1640 
 1641         rotate_ctx(&cpuctx->ctx);
 1642         if (ctx)
 1643                 rotate_ctx(ctx);
 1644 
 1645         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 1646         if (ctx)
 1647                 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
 1648         perf_enable();
 1649 }
 1650 
 1651 static int event_enable_on_exec(struct perf_event *event,
 1652                                 struct perf_event_context *ctx)
 1653 {
 1654         if (!event->attr.enable_on_exec)
 1655                 return 0;
 1656 
 1657         event->attr.enable_on_exec = 0;
 1658         if (event->state >= PERF_EVENT_STATE_INACTIVE)
 1659                 return 0;
 1660 
 1661         __perf_event_mark_enabled(event, ctx);
 1662 
 1663         return 1;
 1664 }
 1665 
 1666 /*
 1667  * Enable all of a task's events that have been marked enable-on-exec.
 1668  * This expects task == current.
 1669  */
 1670 static void perf_event_enable_on_exec(struct task_struct *task)
 1671 {
 1672         struct perf_event_context *ctx;
 1673         struct perf_event *event;
 1674         unsigned long flags;
 1675         int enabled = 0;
 1676         int ret;
 1677 
 1678         local_irq_save(flags);
 1679         ctx = task->perf_event_ctxp;
 1680         if (!ctx || !ctx->nr_events)
 1681                 goto out;
 1682 
 1683         __perf_event_task_sched_out(ctx);
 1684 
 1685         raw_spin_lock(&ctx->lock);
 1686 
 1687         list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 1688                 ret = event_enable_on_exec(event, ctx);
 1689                 if (ret)
 1690                         enabled = 1;
 1691         }
 1692 
 1693         list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
 1694                 ret = event_enable_on_exec(event, ctx);
 1695                 if (ret)
 1696                         enabled = 1;
 1697         }
 1698 
 1699         /*
 1700          * Unclone this context if we enabled any event.
 1701          */
 1702         if (enabled)
 1703                 unclone_ctx(ctx);
 1704 
 1705         raw_spin_unlock(&ctx->lock);
 1706 
 1707         perf_event_task_sched_in(task);
 1708  out:
 1709         local_irq_restore(flags);
 1710 }
 1711 
 1712 /*
 1713  * Cross CPU call to read the hardware event
 1714  */
 1715 static void __perf_event_read(void *info)
 1716 {
 1717         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 1718         struct perf_event *event = info;
 1719         struct perf_event_context *ctx = event->ctx;
 1720 
 1721         /*
 1722          * If this is a task context, we need to check whether it is
  1723          * the current task context of this cpu.  If not, it has been
 1724          * scheduled out before the smp call arrived.  In that case
 1725          * event->count would have been updated to a recent sample
 1726          * when the event was scheduled out.
 1727          */
 1728         if (ctx->task && cpuctx->task_ctx != ctx)
 1729                 return;
 1730 
 1731         raw_spin_lock(&ctx->lock);
 1732         update_context_time(ctx);
 1733         update_event_times(event);
 1734         raw_spin_unlock(&ctx->lock);
 1735 
 1736         event->pmu->read(event);
 1737 }
 1738 
 1739 static inline u64 perf_event_count(struct perf_event *event)
 1740 {
 1741         return local64_read(&event->count) + atomic64_read(&event->child_count);
 1742 }
 1743 
 1744 static u64 perf_event_read(struct perf_event *event)
 1745 {
 1746         /*
 1747          * If event is enabled and currently active on a CPU, update the
 1748          * value in the event structure:
 1749          */
 1750         if (event->state == PERF_EVENT_STATE_ACTIVE) {
 1751                 smp_call_function_single(event->oncpu,
 1752                                          __perf_event_read, event, 1);
 1753         } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 1754                 struct perf_event_context *ctx = event->ctx;
 1755                 unsigned long flags;
 1756 
 1757                 raw_spin_lock_irqsave(&ctx->lock, flags);
 1758                 update_context_time(ctx);
 1759                 update_event_times(event);
 1760                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
 1761         }
 1762 
 1763         return perf_event_count(event);
 1764 }
 1765 
 1766 /*
 1767  * Initialize the perf_event context in a task_struct:
 1768  */
 1769 static void
 1770 __perf_event_init_context(struct perf_event_context *ctx,
 1771                             struct task_struct *task)
 1772 {
 1773         raw_spin_lock_init(&ctx->lock);
 1774         mutex_init(&ctx->mutex);
 1775         INIT_LIST_HEAD(&ctx->pinned_groups);
 1776         INIT_LIST_HEAD(&ctx->flexible_groups);
 1777         INIT_LIST_HEAD(&ctx->event_list);
 1778         atomic_set(&ctx->refcount, 1);
 1779         ctx->task = task;
 1780 }
 1781 
 1782 static struct perf_event_context *find_get_context(pid_t pid, int cpu)
 1783 {
 1784         struct perf_event_context *ctx;
 1785         struct perf_cpu_context *cpuctx;
 1786         struct task_struct *task;
 1787         unsigned long flags;
 1788         int err;
 1789 
 1790         if (pid == -1 && cpu != -1) {
 1791                 /* Must be root to operate on a CPU event: */
 1792                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 1793                         return ERR_PTR(-EACCES);
 1794 
 1795                 if (cpu < 0 || cpu >= nr_cpumask_bits)
 1796                         return ERR_PTR(-EINVAL);
 1797 
 1798                 /*
  1799          * We could be clever and allow attaching an event to an
 1800                  * offline CPU and activate it when the CPU comes up, but
 1801                  * that's for later.
 1802                  */
 1803                 if (!cpu_online(cpu))
 1804                         return ERR_PTR(-ENODEV);
 1805 
 1806                 cpuctx = &per_cpu(perf_cpu_context, cpu);
 1807                 ctx = &cpuctx->ctx;
 1808                 get_ctx(ctx);
 1809 
 1810                 return ctx;
 1811         }
 1812 
 1813         rcu_read_lock();
 1814         if (!pid)
 1815                 task = current;
 1816         else
 1817                 task = find_task_by_vpid(pid);
 1818         if (task)
 1819                 get_task_struct(task);
 1820         rcu_read_unlock();
 1821 
 1822         if (!task)
 1823                 return ERR_PTR(-ESRCH);
 1824 
 1825         /*
 1826          * Can't attach events to a dying task.
 1827          */
 1828         err = -ESRCH;
 1829         if (task->flags & PF_EXITING)
 1830                 goto errout;
 1831 
 1832         /* Reuse ptrace permission checks for now. */
 1833         err = -EACCES;
 1834         if (!ptrace_may_access(task, PTRACE_MODE_READ))
 1835                 goto errout;
 1836 
 1837  retry:
 1838         ctx = perf_lock_task_context(task, &flags);
 1839         if (ctx) {
 1840                 unclone_ctx(ctx);
 1841                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
 1842         }
 1843 
 1844         if (!ctx) {
 1845                 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
 1846                 err = -ENOMEM;
 1847                 if (!ctx)
 1848                         goto errout;
 1849                 __perf_event_init_context(ctx, task);
 1850                 get_ctx(ctx);
 1851                 if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
 1852                         /*
 1853                          * We raced with some other task; use
 1854                          * the context they set.
 1855                          */
 1856                         kfree(ctx);
 1857                         goto retry;
 1858                 }
 1859                 get_task_struct(task);
 1860         }
 1861 
 1862         put_task_struct(task);
 1863         return ctx;
 1864 
 1865  errout:
 1866         put_task_struct(task);
 1867         return ERR_PTR(err);
 1868 }
 1869 
 1870 static void perf_event_free_filter(struct perf_event *event);
 1871 
 1872 static void free_event_rcu(struct rcu_head *head)
 1873 {
 1874         struct perf_event *event;
 1875 
 1876         event = container_of(head, struct perf_event, rcu_head);
 1877         if (event->ns)
 1878                 put_pid_ns(event->ns);
 1879         perf_event_free_filter(event);
 1880         kfree(event);
 1881 }
 1882 
 1883 static void perf_pending_sync(struct perf_event *event);
 1884 static void perf_buffer_put(struct perf_buffer *buffer);
 1885 
 1886 static void free_event(struct perf_event *event)
 1887 {
 1888         perf_pending_sync(event);
 1889 
 1890         if (!event->parent) {
 1891                 atomic_dec(&nr_events);
 1892                 if (event->attr.mmap || event->attr.mmap_data)
 1893                         atomic_dec(&nr_mmap_events);
 1894                 if (event->attr.comm)
 1895                         atomic_dec(&nr_comm_events);
 1896                 if (event->attr.task)
 1897                         atomic_dec(&nr_task_events);
 1898         }
 1899 
 1900         if (event->buffer) {
 1901                 perf_buffer_put(event->buffer);
 1902                 event->buffer = NULL;
 1903         }
 1904 
 1905         if (event->destroy)
 1906                 event->destroy(event);
 1907 
 1908         put_ctx(event->ctx);
 1909         call_rcu(&event->rcu_head, free_event_rcu);
 1910 }
 1911 
 1912 int perf_event_release_kernel(struct perf_event *event)
 1913 {
 1914         struct perf_event_context *ctx = event->ctx;
 1915 
 1916         /*
 1917          * Remove from the PMU, can't get re-enabled since we got
 1918          * here because the last ref went.
 1919          */
 1920         perf_event_disable(event);
 1921 
 1922         WARN_ON_ONCE(ctx->parent_ctx);
 1923         /*
 1924          * There are two ways this annotation is useful:
 1925          *
 1926          *  1) there is a lock recursion from perf_event_exit_task
 1927          *     see the comment there.
 1928          *
 1929          *  2) there is a lock-inversion with mmap_sem through
 1930          *     perf_event_read_group(), which takes faults while
 1931          *     holding ctx->mutex, however this is called after
 1932          *     the last filedesc died, so there is no possibility
 1933          *     to trigger the AB-BA case.
 1934          */
 1935         mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
 1936         raw_spin_lock_irq(&ctx->lock);
 1937         perf_group_detach(event);
 1938         list_del_event(event, ctx);
 1939         raw_spin_unlock_irq(&ctx->lock);
 1940         mutex_unlock(&ctx->mutex);
 1941 
 1942         mutex_lock(&event->owner->perf_event_mutex);
 1943         list_del_init(&event->owner_entry);
 1944         mutex_unlock(&event->owner->perf_event_mutex);
 1945         put_task_struct(event->owner);
 1946 
 1947         free_event(event);
 1948 
 1949         return 0;
 1950 }
 1951 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
 1952 
 1953 /*
 1954  * Called when the last reference to the file is gone.
 1955  */
 1956 static int perf_release(struct inode *inode, struct file *file)
 1957 {
 1958         struct perf_event *event = file->private_data;
 1959 
 1960         file->private_data = NULL;
 1961 
 1962         return perf_event_release_kernel(event);
 1963 }
 1964 
 1965 static int perf_event_read_size(struct perf_event *event)
 1966 {
 1967         int entry = sizeof(u64); /* value */
 1968         int size = 0;
 1969         int nr = 1;
 1970 
 1971         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 1972                 size += sizeof(u64);
 1973 
 1974         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 1975                 size += sizeof(u64);
 1976 
 1977         if (event->attr.read_format & PERF_FORMAT_ID)
 1978                 entry += sizeof(u64);
 1979 
 1980         if (event->attr.read_format & PERF_FORMAT_GROUP) {
 1981                 nr += event->group_leader->nr_siblings;
 1982                 size += sizeof(u64);
 1983         }
 1984 
 1985         size += entry * nr;
 1986 
 1987         return size;
 1988 }
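/*
 * Example of the size computation above: with read_format set to
 * PERF_FORMAT_GROUP | PERF_FORMAT_ID and a group leader that has two
 * siblings, entry = 8 (value) + 8 (id) = 16, nr = 1 + 2 = 3, and
 * size = 8 (the nr field) + 3 * 16 = 56 bytes -- which matches the layout
 * emitted by perf_event_read_group() below: { nr, { value, id } * 3 }.
 */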
 1989 
 1990 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 1991 {
 1992         struct perf_event *child;
 1993         u64 total = 0;
 1994 
 1995         *enabled = 0;
 1996         *running = 0;
 1997 
 1998         mutex_lock(&event->child_mutex);
 1999         total += perf_event_read(event);
 2000         *enabled += event->total_time_enabled +
 2001                         atomic64_read(&event->child_total_time_enabled);
 2002         *running += event->total_time_running +
 2003                         atomic64_read(&event->child_total_time_running);
 2004 
 2005         list_for_each_entry(child, &event->child_list, child_list) {
 2006                 total += perf_event_read(child);
 2007                 *enabled += child->total_time_enabled;
 2008                 *running += child->total_time_running;
 2009         }
 2010         mutex_unlock(&event->child_mutex);
 2011 
 2012         return total;
 2013 }
 2014 EXPORT_SYMBOL_GPL(perf_event_read_value);
 2015 
 2016 static int perf_event_read_group(struct perf_event *event,
 2017                                    u64 read_format, char __user *buf)
 2018 {
 2019         struct perf_event *leader = event->group_leader, *sub;
 2020         int n = 0, size = 0, ret = -EFAULT;
 2021         struct perf_event_context *ctx = leader->ctx;
 2022         u64 values[5];
 2023         u64 count, enabled, running;
 2024 
 2025         mutex_lock(&ctx->mutex);
 2026         count = perf_event_read_value(leader, &enabled, &running);
 2027 
 2028         values[n++] = 1 + leader->nr_siblings;
 2029         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 2030                 values[n++] = enabled;
 2031         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 2032                 values[n++] = running;
 2033         values[n++] = count;
 2034         if (read_format & PERF_FORMAT_ID)
 2035                 values[n++] = primary_event_id(leader);
 2036 
 2037         size = n * sizeof(u64);
 2038 
 2039         if (copy_to_user(buf, values, size))
 2040                 goto unlock;
 2041 
 2042         ret = size;
 2043 
 2044         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 2045                 n = 0;
 2046 
 2047                 values[n++] = perf_event_read_value(sub, &enabled, &running);
 2048                 if (read_format & PERF_FORMAT_ID)
 2049                         values[n++] = primary_event_id(sub);
 2050 
 2051                 size = n * sizeof(u64);
 2052 
 2053                 if (copy_to_user(buf + ret, values, size)) {
 2054                         ret = -EFAULT;
 2055                         goto unlock;
 2056                 }
 2057 
 2058                 ret += size;
 2059         }
 2060 unlock:
 2061         mutex_unlock(&ctx->mutex);
 2062 
 2063         return ret;
 2064 }
 2065 
 2066 static int perf_event_read_one(struct perf_event *event,
 2067                                  u64 read_format, char __user *buf)
 2068 {
 2069         u64 enabled, running;
 2070         u64 values[4];
 2071         int n = 0;
 2072 
 2073         values[n++] = perf_event_read_value(event, &enabled, &running);
 2074         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 2075                 values[n++] = enabled;
 2076         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 2077                 values[n++] = running;
 2078         if (read_format & PERF_FORMAT_ID)
 2079                 values[n++] = primary_event_id(event);
 2080 
 2081         if (copy_to_user(buf, values, n * sizeof(u64)))
 2082                 return -EFAULT;
 2083 
 2084         return n * sizeof(u64);
 2085 }
 2086 
 2087 /*
  2088  * Read the performance event - simple non-blocking version for now
 2089  */
 2090 static ssize_t
 2091 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
 2092 {
 2093         u64 read_format = event->attr.read_format;
 2094         int ret;
 2095 
 2096         /*
  2097          * Return end-of-file for a read on an event that is in
 2098          * error state (i.e. because it was pinned but it couldn't be
 2099          * scheduled on to the CPU at some point).
 2100          */
 2101         if (event->state == PERF_EVENT_STATE_ERROR)
 2102                 return 0;
 2103 
 2104         if (count < perf_event_read_size(event))
 2105                 return -ENOSPC;
 2106 
 2107         WARN_ON_ONCE(event->ctx->parent_ctx);
 2108         if (read_format & PERF_FORMAT_GROUP)
 2109                 ret = perf_event_read_group(event, read_format, buf);
 2110         else
 2111                 ret = perf_event_read_one(event, read_format, buf);
 2112 
 2113         return ret;
 2114 }
 2115 
 2116 static ssize_t
 2117 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 2118 {
 2119         struct perf_event *event = file->private_data;
 2120 
 2121         return perf_read_hw(event, buf, count);
 2122 }
 2123 
 2124 static unsigned int perf_poll(struct file *file, poll_table *wait)
 2125 {
 2126         struct perf_event *event = file->private_data;
 2127         struct perf_buffer *buffer;
 2128         unsigned int events = POLL_HUP;
 2129 
 2130         rcu_read_lock();
 2131         buffer = rcu_dereference(event->buffer);
 2132         if (buffer)
 2133                 events = atomic_xchg(&buffer->poll, 0);
 2134         rcu_read_unlock();
 2135 
 2136         poll_wait(file, &event->waitq, wait);
 2137 
 2138         return events;
 2139 }
 2140 
 2141 static void perf_event_reset(struct perf_event *event)
 2142 {
 2143         (void)perf_event_read(event);
 2144         local64_set(&event->count, 0);
 2145         perf_event_update_userpage(event);
 2146 }
 2147 
 2148 /*
 2149  * Holding the top-level event's child_mutex means that any
 2150  * descendant process that has inherited this event will block
 2151  * in sync_child_event if it goes to exit, thus satisfying the
 2152  * task existence requirements of perf_event_enable/disable.
 2153  */
 2154 static void perf_event_for_each_child(struct perf_event *event,
 2155                                         void (*func)(struct perf_event *))
 2156 {
 2157         struct perf_event *child;
 2158 
 2159         WARN_ON_ONCE(event->ctx->parent_ctx);
 2160         mutex_lock(&event->child_mutex);
 2161         func(event);
 2162         list_for_each_entry(child, &event->child_list, child_list)
 2163                 func(child);
 2164         mutex_unlock(&event->child_mutex);
 2165 }
 2166 
 2167 static void perf_event_for_each(struct perf_event *event,
 2168                                   void (*func)(struct perf_event *))
 2169 {
 2170         struct perf_event_context *ctx = event->ctx;
 2171         struct perf_event *sibling;
 2172 
 2173         WARN_ON_ONCE(ctx->parent_ctx);
 2174         mutex_lock(&ctx->mutex);
 2175         event = event->group_leader;
 2176 
 2177         perf_event_for_each_child(event, func);
 2178         func(event);
 2179         list_for_each_entry(sibling, &event->sibling_list, group_entry)
  2180                 perf_event_for_each_child(sibling, func);
 2181         mutex_unlock(&ctx->mutex);
 2182 }
 2183 
 2184 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 2185 {
 2186         struct perf_event_context *ctx = event->ctx;
 2187         unsigned long size;
 2188         int ret = 0;
 2189         u64 value;
 2190 
 2191         if (!event->attr.sample_period)
 2192                 return -EINVAL;
 2193 
 2194         size = copy_from_user(&value, arg, sizeof(value));
 2195         if (size != sizeof(value))
 2196                 return -EFAULT;
 2197 
 2198         if (!value)
 2199                 return -EINVAL;
 2200 
 2201         raw_spin_lock_irq(&ctx->lock);
 2202         if (event->attr.freq) {
 2203                 if (value > sysctl_perf_event_sample_rate) {
 2204                         ret = -EINVAL;
 2205                         goto unlock;
 2206                 }
 2207 
 2208                 event->attr.sample_freq = value;
 2209         } else {
 2210                 event->attr.sample_period = value;
 2211                 event->hw.sample_period = value;
 2212         }
 2213 unlock:
 2214         raw_spin_unlock_irq(&ctx->lock);
 2215 
 2216         return ret;
 2217 }
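/*
 * Sketch of how user space would drive the ioctl above (assuming 'fd' is a
 * descriptor obtained from the perf_event_open() syscall, which is not part
 * of this file):
 *
 *      u64 new_period = 4096;
 *
 *      if (ioctl(fd, PERF_EVENT_IOC_PERIOD, &new_period))
 *              perror("PERF_EVENT_IOC_PERIOD");
 *
 * For a frequency-based event (attr.freq set) the value is taken as a new
 * sample_freq and must not exceed sysctl_perf_event_sample_rate.
 */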
 2218 
 2219 static const struct file_operations perf_fops;
 2220 
 2221 static struct perf_event *perf_fget_light(int fd, int *fput_needed)
 2222 {
 2223         struct file *file;
 2224 
 2225         file = fget_light(fd, fput_needed);
 2226         if (!file)
 2227                 return ERR_PTR(-EBADF);
 2228 
 2229         if (file->f_op != &perf_fops) {
 2230                 fput_light(file, *fput_needed);
 2231                 *fput_needed = 0;
 2232                 return ERR_PTR(-EBADF);
 2233         }
 2234 
 2235         return file->private_data;
 2236 }
 2237 
 2238 static int perf_event_set_output(struct perf_event *event,
 2239                                  struct perf_event *output_event);
 2240 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 2241 
 2242 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 2243 {
 2244         struct perf_event *event = file->private_data;
 2245         void (*func)(struct perf_event *);
 2246         u32 flags = arg;
 2247 
 2248         switch (cmd) {
 2249         case PERF_EVENT_IOC_ENABLE:
 2250                 func = perf_event_enable;
 2251                 break;
 2252         case PERF_EVENT_IOC_DISABLE:
 2253                 func = perf_event_disable;
 2254                 break;
 2255         case PERF_EVENT_IOC_RESET:
 2256                 func = perf_event_reset;
 2257                 break;
 2258 
 2259         case PERF_EVENT_IOC_REFRESH:
 2260                 return perf_event_refresh(event, arg);
 2261 
 2262         case PERF_EVENT_IOC_PERIOD:
 2263                 return perf_event_period(event, (u64 __user *)arg);
 2264 
 2265         case PERF_EVENT_IOC_SET_OUTPUT:
 2266         {
 2267                 struct perf_event *output_event = NULL;
 2268                 int fput_needed = 0;
 2269                 int ret;
 2270 
 2271                 if (arg != -1) {
 2272                         output_event = perf_fget_light(arg, &fput_needed);
 2273                         if (IS_ERR(output_event))
 2274                                 return PTR_ERR(output_event);
 2275                 }
 2276 
 2277                 ret = perf_event_set_output(event, output_event);
 2278                 if (output_event)
 2279                         fput_light(output_event->filp, fput_needed);
 2280 
 2281                 return ret;
 2282         }
 2283 
 2284         case PERF_EVENT_IOC_SET_FILTER:
 2285                 return perf_event_set_filter(event, (void __user *)arg);
 2286 
 2287         default:
 2288                 return -ENOTTY;
 2289         }
 2290 
 2291         if (flags & PERF_IOC_FLAG_GROUP)
 2292                 perf_event_for_each(event, func);
 2293         else
 2294                 perf_event_for_each_child(event, func);
 2295 
 2296         return 0;
 2297 }
 2298 
 2299 int perf_event_task_enable(void)
 2300 {
 2301         struct perf_event *event;
 2302 
 2303         mutex_lock(&current->perf_event_mutex);
 2304         list_for_each_entry(event, &current->perf_event_list, owner_entry)
 2305                 perf_event_for_each_child(event, perf_event_enable);
 2306         mutex_unlock(&current->perf_event_mutex);
 2307 
 2308         return 0;
 2309 }
 2310 
 2311 int perf_event_task_disable(void)
 2312 {
 2313         struct perf_event *event;
 2314 
 2315         mutex_lock(&current->perf_event_mutex);
 2316         list_for_each_entry(event, &current->perf_event_list, owner_entry)
 2317                 perf_event_for_each_child(event, perf_event_disable);
 2318         mutex_unlock(&current->perf_event_mutex);
 2319 
 2320         return 0;
 2321 }
 2322 
 2323 #ifndef PERF_EVENT_INDEX_OFFSET
 2324 # define PERF_EVENT_INDEX_OFFSET 0
 2325 #endif
 2326 
 2327 static int perf_event_index(struct perf_event *event)
 2328 {
 2329         if (event->state != PERF_EVENT_STATE_ACTIVE)
 2330                 return 0;
 2331 
 2332         return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
 2333 }
 2334 
 2335 /*
 2336  * Callers need to ensure there can be no nesting of this function, otherwise
  2337  * the seqlock logic goes bad. We cannot serialize this because the arch
 2338  * code calls this from NMI context.
 2339  */
 2340 void perf_event_update_userpage(struct perf_event *event)
 2341 {
 2342         struct perf_event_mmap_page *userpg;
 2343         struct perf_buffer *buffer;
 2344 
 2345         rcu_read_lock();
 2346         buffer = rcu_dereference(event->buffer);
 2347         if (!buffer)
 2348                 goto unlock;
 2349 
 2350         userpg = buffer->user_page;
 2351 
 2352         /*
 2353          * Disable preemption so as to not let the corresponding user-space
 2354          * spin too long if we get preempted.
 2355          */
 2356         preempt_disable();
 2357         ++userpg->lock;
 2358         barrier();
 2359         userpg->index = perf_event_index(event);
 2360         userpg->offset = perf_event_count(event);
 2361         if (event->state == PERF_EVENT_STATE_ACTIVE)
 2362                 userpg->offset -= local64_read(&event->hw.prev_count);
 2363 
 2364         userpg->time_enabled = event->total_time_enabled +
 2365                         atomic64_read(&event->child_total_time_enabled);
 2366 
 2367         userpg->time_running = event->total_time_running +
 2368                         atomic64_read(&event->child_total_time_running);
 2369 
 2370         barrier();
 2371         ++userpg->lock;
 2372         preempt_enable();
 2373 unlock:
 2374         rcu_read_unlock();
 2375 }
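/*
 * The lock/barrier pairs above form a seqcount for user space: readers of
 * the mmap()ed control page must retry if ->lock changed underneath them.
 * A minimal read loop, along the lines of the protocol documented for
 * struct perf_event_mmap_page (sketch, 'pc' being the mapped first page):
 *
 *      u32 seq;
 *      u64 offset;
 *
 *      do {
 *              seq = pc->lock;
 *              barrier();
 *              offset = pc->offset;    /* plus a pmc read via pc->index */
 *              barrier();
 *      } while (pc->lock != seq);
 */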
 2376 
 2377 static unsigned long perf_data_size(struct perf_buffer *buffer);
 2378 
 2379 static void
 2380 perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
 2381 {
 2382         long max_size = perf_data_size(buffer);
 2383 
 2384         if (watermark)
 2385                 buffer->watermark = min(max_size, watermark);
 2386 
 2387         if (!buffer->watermark)
 2388                 buffer->watermark = max_size / 2;
 2389 
 2390         if (flags & PERF_BUFFER_WRITABLE)
 2391                 buffer->writable = 1;
 2392 
 2393         atomic_set(&buffer->refcount, 1);
 2394 }
 2395 
 2396 #ifndef CONFIG_PERF_USE_VMALLOC
 2397 
 2398 /*
  2399  * Back perf_mmap() with regular order-0 GFP_KERNEL pages.
 2400  */
 2401 
 2402 static struct page *
 2403 perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
 2404 {
 2405         if (pgoff > buffer->nr_pages)
 2406                 return NULL;
 2407 
 2408         if (pgoff == 0)
 2409                 return virt_to_page(buffer->user_page);
 2410 
 2411         return virt_to_page(buffer->data_pages[pgoff - 1]);
 2412 }
 2413 
 2414 static void *perf_mmap_alloc_page(int cpu)
 2415 {
 2416         struct page *page;
 2417         int node;
 2418 
 2419         node = (cpu == -1) ? cpu : cpu_to_node(cpu);
 2420         page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
 2421         if (!page)
 2422                 return NULL;
 2423 
 2424         return page_address(page);
 2425 }
 2426 
 2427 static struct perf_buffer *
 2428 perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
 2429 {
 2430         struct perf_buffer *buffer;
 2431         unsigned long size;
 2432         int i;
 2433 
 2434         size = sizeof(struct perf_buffer);
 2435         size += nr_pages * sizeof(void *);
 2436 
 2437         buffer = kzalloc(size, GFP_KERNEL);
 2438         if (!buffer)
 2439                 goto fail;
 2440 
 2441         buffer->user_page = perf_mmap_alloc_page(cpu);
 2442         if (!buffer->user_page)
 2443                 goto fail_user_page;
 2444 
 2445         for (i = 0; i < nr_pages; i++) {
 2446                 buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
 2447                 if (!buffer->data_pages[i])
 2448                         goto fail_data_pages;
 2449         }
 2450 
 2451         buffer->nr_pages = nr_pages;
 2452 
 2453         perf_buffer_init(buffer, watermark, flags);
 2454 
 2455         return buffer;
 2456 
 2457 fail_data_pages:
 2458         for (i--; i >= 0; i--)
 2459                 free_page((unsigned long)buffer->data_pages[i]);
 2460 
 2461         free_page((unsigned long)buffer->user_page);
 2462 
 2463 fail_user_page:
 2464         kfree(buffer);
 2465 
 2466 fail:
 2467         return NULL;
 2468 }
 2469 
 2470 static void perf_mmap_free_page(unsigned long addr)
 2471 {
 2472         struct page *page = virt_to_page((void *)addr);
 2473 
 2474         page->mapping = NULL;
 2475         __free_page(page);
 2476 }
 2477 
 2478 static void perf_buffer_free(struct perf_buffer *buffer)
 2479 {
 2480         int i;
 2481 
 2482         perf_mmap_free_page((unsigned long)buffer->user_page);
 2483         for (i = 0; i < buffer->nr_pages; i++)
 2484                 perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
 2485         kfree(buffer);
 2486 }
 2487 
 2488 static inline int page_order(struct perf_buffer *buffer)
 2489 {
 2490         return 0;
 2491 }
 2492 
 2493 #else
 2494 
 2495 /*
 2496  * Back perf_mmap() with vmalloc memory.
 2497  *
 2498  * Required for architectures that have d-cache aliasing issues.
 2499  */
 2500 
 2501 static inline int page_order(struct perf_buffer *buffer)
 2502 {
 2503         return buffer->page_order;
 2504 }
 2505 
 2506 static struct page *
 2507 perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
 2508 {
 2509         if (pgoff > (1UL << page_order(buffer)))
 2510                 return NULL;
 2511 
 2512         return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
 2513 }
 2514 
 2515 static void perf_mmap_unmark_page(void *addr)
 2516 {
 2517         struct page *page = vmalloc_to_page(addr);
 2518 
 2519         page->mapping = NULL;
 2520 }
 2521 
 2522 static void perf_buffer_free_work(struct work_struct *work)
 2523 {
 2524         struct perf_buffer *buffer;
 2525         void *base;
 2526         int i, nr;
 2527 
 2528         buffer = container_of(work, struct perf_buffer, work);
 2529         nr = 1 << page_order(buffer);
 2530 
 2531         base = buffer->user_page;
 2532         for (i = 0; i < nr + 1; i++)
 2533                 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
 2534 
 2535         vfree(base);
 2536         kfree(buffer);
 2537 }
 2538 
 2539 static void perf_buffer_free(struct perf_buffer *buffer)
 2540 {
 2541         schedule_work(&buffer->work);
 2542 }
 2543 
 2544 static struct perf_buffer *
 2545 perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
 2546 {
 2547         struct perf_buffer *buffer;
 2548         unsigned long size;
 2549         void *all_buf;
 2550 
 2551         size = sizeof(struct perf_buffer);
 2552         size += sizeof(void *);
 2553 
 2554         buffer = kzalloc(size, GFP_KERNEL);
 2555         if (!buffer)
 2556                 goto fail;
 2557 
 2558         INIT_WORK(&buffer->work, perf_buffer_free_work);
 2559 
 2560         all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
 2561         if (!all_buf)
 2562                 goto fail_all_buf;
 2563 
 2564         buffer->user_page = all_buf;
 2565         buffer->data_pages[0] = all_buf + PAGE_SIZE;
 2566         buffer->page_order = ilog2(nr_pages);
 2567         buffer->nr_pages = 1;
 2568 
 2569         perf_buffer_init(buffer, watermark, flags);
 2570 
 2571         return buffer;
 2572 
 2573 fail_all_buf:
 2574         kfree(buffer);
 2575 
 2576 fail:
 2577         return NULL;
 2578 }
 2579 
 2580 #endif
 2581 
 2582 static unsigned long perf_data_size(struct perf_buffer *buffer)
 2583 {
 2584         return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
 2585 }
 2586 
 2587 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 2588 {
 2589         struct perf_event *event = vma->vm_file->private_data;
 2590         struct perf_buffer *buffer;
 2591         int ret = VM_FAULT_SIGBUS;
 2592 
 2593         if (vmf->flags & FAULT_FLAG_MKWRITE) {
 2594                 if (vmf->pgoff == 0)
 2595                         ret = 0;
 2596                 return ret;
 2597         }
 2598 
 2599         rcu_read_lock();
 2600         buffer = rcu_dereference(event->buffer);
 2601         if (!buffer)
 2602                 goto unlock;
 2603 
 2604         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
 2605                 goto unlock;
 2606 
 2607         vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
 2608         if (!vmf->page)
 2609                 goto unlock;
 2610 
 2611         get_page(vmf->page);
 2612         vmf->page->mapping = vma->vm_file->f_mapping;
 2613         vmf->page->index   = vmf->pgoff;
 2614 
 2615         ret = 0;
 2616 unlock:
 2617         rcu_read_unlock();
 2618 
 2619         return ret;
 2620 }
 2621 
 2622 static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
 2623 {
 2624         struct perf_buffer *buffer;
 2625 
 2626         buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
 2627         perf_buffer_free(buffer);
 2628 }
 2629 
 2630 static struct perf_buffer *perf_buffer_get(struct perf_event *event)
 2631 {
 2632         struct perf_buffer *buffer;
 2633 
 2634         rcu_read_lock();
 2635         buffer = rcu_dereference(event->buffer);
 2636         if (buffer) {
 2637                 if (!atomic_inc_not_zero(&buffer->refcount))
 2638                         buffer = NULL;
 2639         }
 2640         rcu_read_unlock();
 2641 
 2642         return buffer;
 2643 }
 2644 
 2645 static void perf_buffer_put(struct perf_buffer *buffer)
 2646 {
 2647         if (!atomic_dec_and_test(&buffer->refcount))
 2648                 return;
 2649 
 2650         call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
 2651 }
 2652 
 2653 static void perf_mmap_open(struct vm_area_struct *vma)
 2654 {
 2655         struct perf_event *event = vma->vm_file->private_data;
 2656 
 2657         atomic_inc(&event->mmap_count);
 2658 }
 2659 
 2660 static void perf_mmap_close(struct vm_area_struct *vma)
 2661 {
 2662         struct perf_event *event = vma->vm_file->private_data;
 2663 
 2664         if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
 2665                 unsigned long size = perf_data_size(event->buffer);
 2666                 struct user_struct *user = event->mmap_user;
 2667                 struct perf_buffer *buffer = event->buffer;
 2668 
 2669                 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
 2670                 vma->vm_mm->locked_vm -= event->mmap_locked;
 2671                 rcu_assign_pointer(event->buffer, NULL);
 2672                 mutex_unlock(&event->mmap_mutex);
 2673 
 2674                 perf_buffer_put(buffer);
 2675                 free_uid(user);
 2676         }
 2677 }
 2678 
 2679 static const struct vm_operations_struct perf_mmap_vmops = {
 2680         .open           = perf_mmap_open,
 2681         .close          = perf_mmap_close,
 2682         .fault          = perf_mmap_fault,
 2683         .page_mkwrite   = perf_mmap_fault,
 2684 };
 2685 
 2686 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 2687 {
 2688         struct perf_event *event = file->private_data;
 2689         unsigned long user_locked, user_lock_limit;
 2690         struct user_struct *user = current_user();
 2691         unsigned long locked, lock_limit;
 2692         struct perf_buffer *buffer;
 2693         unsigned long vma_size;
 2694         unsigned long nr_pages;
 2695         long user_extra, extra;
 2696         int ret = 0, flags = 0;
 2697 
 2698         /*
 2699          * Don't allow mmap() of inherited per-task counters. This would
 2700          * create a performance issue due to all children writing to the
 2701          * same buffer.
 2702          */
 2703         if (event->cpu == -1 && event->attr.inherit)
 2704                 return -EINVAL;
 2705 
 2706         if (!(vma->vm_flags & VM_SHARED))
 2707                 return -EINVAL;
 2708 
 2709         vma_size = vma->vm_end - vma->vm_start;
 2710         nr_pages = (vma_size / PAGE_SIZE) - 1;
 2711 
 2712         /*
 2713          * If we have buffer pages ensure they're a power-of-two number, so we
 2714          * can do bitmasks instead of modulo.
 2715          */
 2716         if (nr_pages != 0 && !is_power_of_2(nr_pages))
 2717                 return -EINVAL;
 2718 
 2719         if (vma_size != PAGE_SIZE * (1 + nr_pages))
 2720                 return -EINVAL;
 2721 
 2722         if (vma->vm_pgoff != 0)
 2723                 return -EINVAL;
 2724 
 2725         WARN_ON_ONCE(event->ctx->parent_ctx);
 2726         mutex_lock(&event->mmap_mutex);
 2727         if (event->buffer) {
 2728                 if (event->buffer->nr_pages == nr_pages)
 2729                         atomic_inc(&event->buffer->refcount);
 2730                 else
 2731                         ret = -EINVAL;
 2732                 goto unlock;
 2733         }
 2734 
 2735         user_extra = nr_pages + 1;
 2736         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
 2737 
 2738         /*
 2739          * Increase the limit linearly with more CPUs:
 2740          */
 2741         user_lock_limit *= num_online_cpus();
 2742 
 2743         user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 2744 
 2745         extra = 0;
 2746         if (user_locked > user_lock_limit)
 2747                 extra = user_locked - user_lock_limit;
 2748 
 2749         lock_limit = rlimit(RLIMIT_MEMLOCK);
 2750         lock_limit >>= PAGE_SHIFT;
 2751         locked = vma->vm_mm->locked_vm + extra;
 2752 
 2753         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
 2754                 !capable(CAP_IPC_LOCK)) {
 2755                 ret = -EPERM;
 2756                 goto unlock;
 2757         }
 2758 
 2759         WARN_ON(event->buffer);
 2760 
 2761         if (vma->vm_flags & VM_WRITE)
 2762                 flags |= PERF_BUFFER_WRITABLE;
 2763 
 2764         buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
 2765                                    event->cpu, flags);
 2766         if (!buffer) {
 2767                 ret = -ENOMEM;
 2768                 goto unlock;
 2769         }
 2770         rcu_assign_pointer(event->buffer, buffer);
 2771 
 2772         atomic_long_add(user_extra, &user->locked_vm);
 2773         event->mmap_locked = extra;
 2774         event->mmap_user = get_current_user();
 2775         vma->vm_mm->locked_vm += event->mmap_locked;
 2776 
 2777 unlock:
 2778         if (!ret)
 2779                 atomic_inc(&event->mmap_count);
 2780         mutex_unlock(&event->mmap_mutex);
 2781 
 2782         vma->vm_flags |= VM_RESERVED;
 2783         vma->vm_ops = &perf_mmap_vmops;
 2784 
 2785         return ret;
 2786 }
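/*
 * User-space view of the constraints checked above (sketch, assuming 'fd'
 * came from perf_event_open()): the mapping must cover one control page plus
 * a power-of-two number of data pages, at offset 0:
 *
 *      size_t page = sysconf(_SC_PAGESIZE);
 *      void *base = mmap(NULL, (1 + 8) * page, PROT_READ | PROT_WRITE,
 *                        MAP_SHARED, fd, 0);
 *
 * Accounting example: with the default sysctl_perf_event_mlock of 512 kB and
 * 4 kB pages, user_lock_limit = 512 >> (12 - 10) = 128 pages, scaled by
 * num_online_cpus(); anything beyond that is charged against RLIMIT_MEMLOCK
 * unless the caller has CAP_IPC_LOCK (or paranoia is relaxed).
 */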
 2787 
 2788 static int perf_fasync(int fd, struct file *filp, int on)
 2789 {
 2790         struct inode *inode = filp->f_path.dentry->d_inode;
 2791         struct perf_event *event = filp->private_data;
 2792         int retval;
 2793 
 2794         mutex_lock(&inode->i_mutex);
 2795         retval = fasync_helper(fd, filp, on, &event->fasync);
 2796         mutex_unlock(&inode->i_mutex);
 2797 
 2798         if (retval < 0)
 2799                 return retval;
 2800 
 2801         return 0;
 2802 }
 2803 
 2804 static const struct file_operations perf_fops = {
 2805         .llseek                 = no_llseek,
 2806         .release                = perf_release,
 2807         .read                   = perf_read,
 2808         .poll                   = perf_poll,
 2809         .unlocked_ioctl         = perf_ioctl,
 2810         .compat_ioctl           = perf_ioctl,
 2811         .mmap                   = perf_mmap,
 2812         .fasync                 = perf_fasync,
 2813 };
 2814 
 2815 /*
 2816  * Perf event wakeup
 2817  *
 2818  * If there's data, ensure we set the poll() state and publish everything
 2819  * to user-space before waking everybody up.
 2820  */
 2821 
 2822 void perf_event_wakeup(struct perf_event *event)
 2823 {
 2824         wake_up_all(&event->waitq);
 2825 
 2826         if (event->pending_kill) {
 2827                 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
 2828                 event->pending_kill = 0;
 2829         }
 2830 }
 2831 
 2832 /*
 2833  * Pending wakeups
 2834  *
  2835  * Handle the case where we need to wake up from NMI (or rq->lock) context.
 2836  *
 2837  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
  2838  * singly linked list and use cmpxchg() to add entries locklessly.
 2839  */
 2840 
 2841 static void perf_pending_event(struct perf_pending_entry *entry)
 2842 {
 2843         struct perf_event *event = container_of(entry,
 2844                         struct perf_event, pending);
 2845 
 2846         if (event->pending_disable) {
 2847                 event->pending_disable = 0;
 2848                 __perf_event_disable(event);
 2849         }
 2850 
 2851         if (event->pending_wakeup) {
 2852                 event->pending_wakeup = 0;
 2853                 perf_event_wakeup(event);
 2854         }
 2855 }
 2856 
 2857 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
 2858 
 2859 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
 2860         PENDING_TAIL,
 2861 };
 2862 
 2863 static void perf_pending_queue(struct perf_pending_entry *entry,
 2864                                void (*func)(struct perf_pending_entry *))
 2865 {
 2866         struct perf_pending_entry **head;
 2867 
 2868         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
 2869                 return;
 2870 
 2871         entry->func = func;
 2872 
 2873         head = &get_cpu_var(perf_pending_head);
 2874 
 2875         do {
 2876                 entry->next = *head;
 2877         } while (cmpxchg(head, entry->next, entry) != entry->next);
 2878 
 2879         set_perf_event_pending();
 2880 
 2881         put_cpu_var(perf_pending_head);
 2882 }
 2883 
 2884 static int __perf_pending_run(void)
 2885 {
 2886         struct perf_pending_entry *list;
 2887         int nr = 0;
 2888 
 2889         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
 2890         while (list != PENDING_TAIL) {
 2891                 void (*func)(struct perf_pending_entry *);
 2892                 struct perf_pending_entry *entry = list;
 2893 
 2894                 list = list->next;
 2895 
 2896                 func = entry->func;
 2897                 entry->next = NULL;
 2898                 /*
 2899                  * Ensure we observe the unqueue before we issue the wakeup,
 2900                  * so that we won't be waiting forever.
 2901                  * -- see perf_not_pending().
 2902                  */
 2903                 smp_wmb();
 2904 
 2905                 func(entry);
 2906                 nr++;
 2907         }
 2908 
 2909         return nr;
 2910 }
 2911 
 2912 static inline int perf_not_pending(struct perf_event *event)
 2913 {
 2914         /*
  2915          * If we flush on whatever cpu we run on, there is a chance we don't
 2916          * need to wait.
 2917          */
 2918         get_cpu();
 2919         __perf_pending_run();
 2920         put_cpu();
 2921 
 2922         /*
 2923          * Ensure we see the proper queue state before going to sleep
  2924          * so that we do not miss the wakeup. -- see __perf_pending_run()
 2925          */
 2926         smp_rmb();
 2927         return event->pending.next == NULL;
 2928 }
 2929 
 2930 static void perf_pending_sync(struct perf_event *event)
 2931 {
 2932         wait_event(event->waitq, perf_not_pending(event));
 2933 }
 2934 
 2935 void perf_event_do_pending(void)
 2936 {
 2937         __perf_pending_run();
 2938 }
 2939 
 2940 /*
 2941  * Callchain support -- arch specific
 2942  */
 2943 
 2944 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 2945 {
 2946         return NULL;
 2947 }
 2948 
 2949 
 2950 /*
 2951  * We assume there is only KVM supporting the callbacks.
 2952  * Later on, we might change it to a list if there is
 2953  * another virtualization implementation supporting the callbacks.
 2954  */
 2955 struct perf_guest_info_callbacks *perf_guest_cbs;
 2956 
 2957 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 2958 {
 2959         perf_guest_cbs = cbs;
 2960         return 0;
 2961 }
 2962 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
 2963 
 2964 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 2965 {
 2966         perf_guest_cbs = NULL;
 2967         return 0;
 2968 }
 2969 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 2970 
 2971 /*
 2972  * Output
 2973  */
 2974 static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
 2975                               unsigned long offset, unsigned long head)
 2976 {
 2977         unsigned long mask;
 2978 
 2979         if (!buffer->writable)
 2980                 return true;
 2981 
 2982         mask = perf_data_size(buffer) - 1;
 2983 
 2984         offset = (offset - tail) & mask;
 2985         head   = (head   - tail) & mask;
 2986 
 2987         if ((int)(head - offset) < 0)
 2988                 return false;
 2989 
 2990         return true;
 2991 }
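/*
 * Worked example for the space check above: with 16 data pages of 4 kB the
 * mask is 65535.  If user space has consumed up to tail = 1000, the current
 * head (offset) is 60000 and the new head would be 68192, then relative to
 * tail offset = 59000 and head = (68192 - 1000) & 65535 = 1656; since
 * 1656 - 59000 is negative, the write would overrun unread data and
 * perf_output_begin() fails, bumping buffer->lost instead.
 */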
 2992 
 2993 static void perf_output_wakeup(struct perf_output_handle *handle)
 2994 {
 2995         atomic_set(&handle->buffer->poll, POLL_IN);
 2996 
 2997         if (handle->nmi) {
 2998                 handle->event->pending_wakeup = 1;
 2999                 perf_pending_queue(&handle->event->pending,
 3000                                    perf_pending_event);
 3001         } else
 3002                 perf_event_wakeup(handle->event);
 3003 }
 3004 
 3005 /*
  3006  * We need to ensure a later event doesn't publish a head when a former
 3007  * event isn't done writing. However since we need to deal with NMIs we
 3008  * cannot fully serialize things.
 3009  *
 3010  * We only publish the head (and generate a wakeup) when the outer-most
 3011  * event completes.
 3012  */
 3013 static void perf_output_get_handle(struct perf_output_handle *handle)
 3014 {
 3015         struct perf_buffer *buffer = handle->buffer;
 3016 
 3017         preempt_disable();
 3018         local_inc(&buffer->nest);
 3019         handle->wakeup = local_read(&buffer->wakeup);
 3020 }
 3021 
 3022 static void perf_output_put_handle(struct perf_output_handle *handle)
 3023 {
 3024         struct perf_buffer *buffer = handle->buffer;
 3025         unsigned long head;
 3026 
 3027 again:
 3028         head = local_read(&buffer->head);
 3029 
 3030         /*
 3031          * IRQ/NMI can happen here, which means we can miss a head update.
 3032          */
 3033 
 3034         if (!local_dec_and_test(&buffer->nest))
 3035                 goto out;
 3036 
 3037         /*
 3038          * Publish the known good head. Rely on the full barrier implied
  3039          * by local_dec_and_test() to order the buffer->head read and this
 3040          * write.
 3041          */
 3042         buffer->user_page->data_head = head;
 3043 
 3044         /*
 3045          * Now check if we missed an update, rely on the (compiler)
  3046          * barrier in local_dec_and_test() to re-read buffer->head.
 3047          */
 3048         if (unlikely(head != local_read(&buffer->head))) {
 3049                 local_inc(&buffer->nest);
 3050                 goto again;
 3051         }
 3052 
 3053         if (handle->wakeup != local_read(&buffer->wakeup))
 3054                 perf_output_wakeup(handle);
 3055 
 3056  out:
 3057         preempt_enable();
 3058 }
 3059 
 3060 __always_inline void perf_output_copy(struct perf_output_handle *handle,
 3061                       const void *buf, unsigned int len)
 3062 {
 3063         do {
 3064                 unsigned long size = min_t(unsigned long, handle->size, len);
 3065 
 3066                 memcpy(handle->addr, buf, size);
 3067 
 3068                 len -= size;
 3069                 handle->addr += size;
 3070                 buf += size;
 3071                 handle->size -= size;
 3072                 if (!handle->size) {
 3073                         struct perf_buffer *buffer = handle->buffer;
 3074 
 3075                         handle->page++;
 3076                         handle->page &= buffer->nr_pages - 1;
 3077                         handle->addr = buffer->data_pages[handle->page];
 3078                         handle->size = PAGE_SIZE << page_order(buffer);
 3079                 }
 3080         } while (len);
 3081 }
 3082 
 3083 int perf_output_begin(struct perf_output_handle *handle,
 3084                       struct perf_event *event, unsigned int size,
 3085                       int nmi, int sample)
 3086 {
 3087         struct perf_buffer *buffer;
 3088         unsigned long tail, offset, head;
 3089         int have_lost;
 3090         struct {
 3091                 struct perf_event_header header;
 3092                 u64                      id;
 3093                 u64                      lost;
 3094         } lost_event;
 3095 
 3096         rcu_read_lock();
 3097         /*
 3098          * For inherited events we send all the output towards the parent.
 3099          */
 3100         if (event->parent)
 3101                 event = event->parent;
 3102 
 3103         buffer = rcu_dereference(event->buffer);
 3104         if (!buffer)
 3105                 goto out;
 3106 
 3107         handle->buffer  = buffer;
 3108         handle->event   = event;
 3109         handle->nmi     = nmi;
 3110         handle->sample  = sample;
 3111 
 3112         if (!buffer->nr_pages)
 3113                 goto out;
 3114 
 3115         have_lost = local_read(&buffer->lost);
 3116         if (have_lost)
 3117                 size += sizeof(lost_event);
 3118 
 3119         perf_output_get_handle(handle);
 3120 
 3121         do {
 3122                 /*
  3123                  * Userspace could choose to issue an mb() before updating the
  3124                  * tail pointer, so that all reads are completed before the
  3125                  * write is issued.
 3126                  */
 3127                 tail = ACCESS_ONCE(buffer->user_page->data_tail);
 3128                 smp_rmb();
 3129                 offset = head = local_read(&buffer->head);
 3130                 head += size;
 3131                 if (unlikely(!perf_output_space(buffer, tail, offset, head)))
 3132                         goto fail;
 3133         } while (local_cmpxchg(&buffer->head, offset, head) != offset);
 3134 
 3135         if (head - local_read(&buffer->wakeup) > buffer->watermark)
 3136                 local_add(buffer->watermark, &buffer->wakeup);
 3137 
 3138         handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
 3139         handle->page &= buffer->nr_pages - 1;
 3140         handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
 3141         handle->addr = buffer->data_pages[handle->page];
 3142         handle->addr += handle->size;
 3143         handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
 3144 
 3145         if (have_lost) {
 3146                 lost_event.header.type = PERF_RECORD_LOST;
 3147                 lost_event.header.misc = 0;
 3148                 lost_event.header.size = sizeof(lost_event);
 3149                 lost_event.id          = event->id;
 3150                 lost_event.lost        = local_xchg(&buffer->lost, 0);
 3151 
 3152                 perf_output_put(handle, lost_event);
 3153         }
 3154 
 3155         return 0;
 3156 
 3157 fail:
 3158         local_inc(&buffer->lost);
 3159         perf_output_put_handle(handle);
 3160 out:
 3161         rcu_read_unlock();
 3162 
 3163         return -ENOSPC;
 3164 }
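      /*
       * The reservation above pairs with a userspace consumer of the mmap'ed
       * ring buffer.  A rough sketch of that side (illustrative only; pg is
       * the mapped struct perf_event_mmap_page, data the data area, mask the
       * data size minus one; these names are not from this file):
       *
       *   __u64 head = pg->data_head;
       *   rmb();                              // pairs with smp_rmb() above
       *   while (tail != head) {
       *           struct perf_event_header *hdr = data + (tail & mask);
       *           consume(hdr);               // record is hdr->size bytes
       *           tail += hdr->size;
       *   }
       *   pg->data_tail = tail;               // frees the space for new data
       */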
 3165 
 3166 void perf_output_end(struct perf_output_handle *handle)
 3167 {
 3168         struct perf_event *event = handle->event;
 3169         struct perf_buffer *buffer = handle->buffer;
 3170 
 3171         int wakeup_events = event->attr.wakeup_events;
 3172 
 3173         if (handle->sample && wakeup_events) {
 3174                 int events = local_inc_return(&buffer->events);
 3175                 if (events >= wakeup_events) {
 3176                         local_sub(wakeup_events, &buffer->events);
 3177                         local_inc(&buffer->wakeup);
 3178                 }
 3179         }
 3180 
 3181         perf_output_put_handle(handle);
 3182         rcu_read_unlock();
 3183 }
 3184 
 3185 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
 3186 {
 3187         /*
 3188          * only top level events have the pid namespace they were created in
 3189          */
 3190         if (event->parent)
 3191                 event = event->parent;
 3192 
 3193         return task_tgid_nr_ns(p, event->ns);
 3194 }
 3195 
 3196 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
 3197 {
 3198         /*
 3199          * only top level events have the pid namespace they were created in
 3200          */
 3201         if (event->parent)
 3202                 event = event->parent;
 3203 
 3204         return task_pid_nr_ns(p, event->ns);
 3205 }
 3206 
 3207 static void perf_output_read_one(struct perf_output_handle *handle,
 3208                                  struct perf_event *event)
 3209 {
 3210         u64 read_format = event->attr.read_format;
 3211         u64 values[4];
 3212         int n = 0;
 3213 
 3214         values[n++] = perf_event_count(event);
 3215         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
 3216                 values[n++] = event->total_time_enabled +
 3217                         atomic64_read(&event->child_total_time_enabled);
 3218         }
 3219         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
 3220                 values[n++] = event->total_time_running +
 3221                         atomic64_read(&event->child_total_time_running);
 3222         }
 3223         if (read_format & PERF_FORMAT_ID)
 3224                 values[n++] = primary_event_id(event);
 3225 
 3226         perf_output_copy(handle, values, n * sizeof(u64));
 3227 }
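      /*
       * Resulting body of a non-group read, in order; each optional field is
       * present only when the corresponding bit is set in read_format:
       *
       *   u64 value;
       *   u64 time_enabled;        // PERF_FORMAT_TOTAL_TIME_ENABLED
       *   u64 time_running;        // PERF_FORMAT_TOTAL_TIME_RUNNING
       *   u64 id;                  // PERF_FORMAT_ID
       */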
 3228 
 3229 /*
 3230  * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
 3231  */
 3232 static void perf_output_read_group(struct perf_output_handle *handle,
 3233                             struct perf_event *event)
 3234 {
 3235         struct perf_event *leader = event->group_leader, *sub;
 3236         u64 read_format = event->attr.read_format;
 3237         u64 values[5];
 3238         int n = 0;
 3239 
 3240         values[n++] = 1 + leader->nr_siblings;
 3241 
 3242         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 3243                 values[n++] = leader->total_time_enabled;
 3244 
 3245         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 3246                 values[n++] = leader->total_time_running;
 3247 
 3248         if (leader != event)
 3249                 leader->pmu->read(leader);
 3250 
 3251         values[n++] = perf_event_count(leader);
 3252         if (read_format & PERF_FORMAT_ID)
 3253                 values[n++] = primary_event_id(leader);
 3254 
 3255         perf_output_copy(handle, values, n * sizeof(u64));
 3256 
 3257         list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 3258                 n = 0;
 3259 
 3260                 if (sub != event)
 3261                         sub->pmu->read(sub);
 3262 
 3263                 values[n++] = perf_event_count(sub);
 3264                 if (read_format & PERF_FORMAT_ID)
 3265                         values[n++] = primary_event_id(sub);
 3266 
 3267                 perf_output_copy(handle, values, n * sizeof(u64));
 3268         }
 3269 }
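      /*
       * With PERF_FORMAT_GROUP the layout written above becomes:
       *
       *   u64 nr;                  // 1 + number of siblings
       *   u64 time_enabled;        // optional, leader's time only
       *   u64 time_running;        // optional, leader's time only
       *   { u64 value; [u64 id;] } // leader first, then each sibling
       */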
 3270 
 3271 static void perf_output_read(struct perf_output_handle *handle,
 3272                              struct perf_event *event)
 3273 {
 3274         if (event->attr.read_format & PERF_FORMAT_GROUP)
 3275                 perf_output_read_group(handle, event);
 3276         else
 3277                 perf_output_read_one(handle, event);
 3278 }
 3279 
 3280 void perf_output_sample(struct perf_output_handle *handle,
 3281                         struct perf_event_header *header,
 3282                         struct perf_sample_data *data,
 3283                         struct perf_event *event)
 3284 {
 3285         u64 sample_type = data->type;
 3286 
 3287         perf_output_put(handle, *header);
 3288 
 3289         if (sample_type & PERF_SAMPLE_IP)
 3290                 perf_output_put(handle, data->ip);
 3291 
 3292         if (sample_type & PERF_SAMPLE_TID)
 3293                 perf_output_put(handle, data->tid_entry);
 3294 
 3295         if (sample_type & PERF_SAMPLE_TIME)
 3296                 perf_output_put(handle, data->time);
 3297 
 3298         if (sample_type & PERF_SAMPLE_ADDR)
 3299                 perf_output_put(handle, data->addr);
 3300 
 3301         if (sample_type & PERF_SAMPLE_ID)
 3302                 perf_output_put(handle, data->id);
 3303 
 3304         if (sample_type & PERF_SAMPLE_STREAM_ID)
 3305                 perf_output_put(handle, data->stream_id);
 3306 
 3307         if (sample_type & PERF_SAMPLE_CPU)
 3308                 perf_output_put(handle, data->cpu_entry);
 3309 
 3310         if (sample_type & PERF_SAMPLE_PERIOD)
 3311                 perf_output_put(handle, data->period);
 3312 
 3313         if (sample_type & PERF_SAMPLE_READ)
 3314                 perf_output_read(handle, event);
 3315 
 3316         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 3317                 if (data->callchain) {
 3318                         int size = 1;
 3319 
 3320                         /* data->callchain checked non-NULL above */
 3321                         size += data->callchain->nr;
 3322 
 3323                         size *= sizeof(u64);
 3324 
 3325                         perf_output_copy(handle, data->callchain, size);
 3326                 } else {
 3327                         u64 nr = 0;
 3328                         perf_output_put(handle, nr);
 3329                 }
 3330         }
 3331 
 3332         if (sample_type & PERF_SAMPLE_RAW) {
 3333                 if (data->raw) {
 3334                         perf_output_put(handle, data->raw->size);
 3335                         perf_output_copy(handle, data->raw->data,
 3336                                          data->raw->size);
 3337                 } else {
 3338                         struct {
 3339                                 u32     size;
 3340                                 u32     data;
 3341                         } raw = {
 3342                                 .size = sizeof(u32),
 3343                                 .data = 0,
 3344                         };
 3345                         perf_output_put(handle, raw);
 3346                 }
 3347         }
 3348 }
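      /*
       * The fields above are emitted in the fixed PERF_SAMPLE_* order and
       * must stay in sync with the size accounting in perf_prepare_sample()
       * below.  For example, sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID
       * | PERF_SAMPLE_PERIOD yields a record body of:
       *
       *   u64 ip;
       *   { u32 pid, tid; }        // data->tid_entry
       *   u64 period;
       */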
 3349 
 3350 void perf_prepare_sample(struct perf_event_header *header,
 3351                          struct perf_sample_data *data,
 3352                          struct perf_event *event,
 3353                          struct pt_regs *regs)
 3354 {
 3355         u64 sample_type = event->attr.sample_type;
 3356 
 3357         data->type = sample_type;
 3358 
 3359         header->type = PERF_RECORD_SAMPLE;
 3360         header->size = sizeof(*header);
 3361 
 3362         header->misc = 0;
 3363         header->misc |= perf_misc_flags(regs);
 3364 
 3365         if (sample_type & PERF_SAMPLE_IP) {
 3366                 data->ip = perf_instruction_pointer(regs);
 3367 
 3368                 header->size += sizeof(data->ip);
 3369         }
 3370 
 3371         if (sample_type & PERF_SAMPLE_TID) {
 3372                 /* pid/tid are reported in the event's pid namespace */
 3373                 data->tid_entry.pid = perf_event_pid(event, current);
 3374                 data->tid_entry.tid = perf_event_tid(event, current);
 3375 
 3376                 header->size += sizeof(data->tid_entry);
 3377         }
 3378 
 3379         if (sample_type & PERF_SAMPLE_TIME) {
 3380                 data->time = perf_clock();
 3381 
 3382                 header->size += sizeof(data->time);
 3383         }
 3384 
 3385         if (sample_type & PERF_SAMPLE_ADDR)
 3386                 header->size += sizeof(data->addr);
 3387 
 3388         if (sample_type & PERF_SAMPLE_ID) {
 3389                 data->id = primary_event_id(event);
 3390 
 3391                 header->size += sizeof(data->id);
 3392         }
 3393 
 3394         if (sample_type & PERF_SAMPLE_STREAM_ID) {
 3395                 data->stream_id = event->id;
 3396 
 3397                 header->size += sizeof(data->stream_id);
 3398         }
 3399 
 3400         if (sample_type & PERF_SAMPLE_CPU) {
 3401                 data->cpu_entry.cpu             = raw_smp_processor_id();
 3402                 data->cpu_entry.reserved        = 0;
 3403 
 3404                 header->size += sizeof(data->cpu_entry);
 3405         }
 3406 
 3407         if (sample_type & PERF_SAMPLE_PERIOD)
 3408                 header->size += sizeof(data->period);
 3409 
 3410         if (sample_type & PERF_SAMPLE_READ)
 3411                 header->size += perf_event_read_size(event);
 3412 
 3413         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 3414                 int size = 1;
 3415 
 3416                 data->callchain = perf_callchain(regs);
 3417 
 3418                 if (data->callchain)
 3419                         size += data->callchain->nr;
 3420 
 3421                 header->size += size * sizeof(u64);
 3422         }
 3423 
 3424         if (sample_type & PERF_SAMPLE_RAW) {
 3425                 int size = sizeof(u32);
 3426 
 3427                 if (data->raw)
 3428                         size += data->raw->size;
 3429                 else
 3430                         size += sizeof(u32);
 3431 
 3432                 WARN_ON_ONCE(size & (sizeof(u64)-1));
 3433                 header->size += size;
 3434         }
 3435 }
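      /*
       * header->size computed here is exactly what perf_output_sample()
       * writes and what perf_output_begin() reserves.  For the
       * IP | TID | PERIOD example above that is:
       *
       *   8 (header) + 8 (ip) + 8 (tid_entry) + 8 (period) = 32 bytes
       */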
 3436 
 3437 static void perf_event_output(struct perf_event *event, int nmi,
 3438                                 struct perf_sample_data *data,
 3439                                 struct pt_regs *regs)
 3440 {
 3441         struct perf_output_handle handle;
 3442         struct perf_event_header header;
 3443 
 3444         perf_prepare_sample(&header, data, event, regs);
 3445 
 3446         if (perf_output_begin(&handle, event, header.size, nmi, 1))
 3447                 return;
 3448 
 3449         perf_output_sample(&handle, &header, data, event);
 3450 
 3451         perf_output_end(&handle);
 3452 }
 3453 
 3454 /*
 3455  * read event_id
 3456  */
 3457 
 3458 struct perf_read_event {
 3459         struct perf_event_header        header;
 3460 
 3461         u32                             pid;
 3462         u32                             tid;
 3463 };
 3464 
 3465 static void
 3466 perf_event_read_event(struct perf_event *event,
 3467                         struct task_struct *task)
 3468 {
 3469         struct perf_output_handle handle;
 3470         struct perf_read_event read_event = {
 3471                 .header = {
 3472                         .type = PERF_RECORD_READ,
 3473                         .misc = 0,
 3474                         .size = sizeof(read_event) + perf_event_read_size(event),
 3475                 },
 3476                 .pid = perf_event_pid(event, task),
 3477                 .tid = perf_event_tid(event, task),
 3478         };
 3479         int ret;
 3480 
 3481         ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
 3482         if (ret)
 3483                 return;
 3484 
 3485         perf_output_put(&handle, read_event);
 3486         perf_output_read(&handle, event);
 3487 
 3488         perf_output_end(&handle);
 3489 }
 3490 
 3491 /*
 3492  * task tracking -- fork/exit
 3493  *
 3494  * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
 3495  */
 3496 
 3497 struct perf_task_event {
 3498         struct task_struct              *task;
 3499         struct perf_event_context       *task_ctx;
 3500 
 3501         struct {
 3502                 struct perf_event_header        header;
 3503 
 3504                 u32                             pid;
 3505                 u32                             ppid;
 3506                 u32                             tid;
 3507                 u32                             ptid;
 3508                 u64                             time;
 3509         } event_id;
 3510 };
 3511 
 3512 static void perf_event_task_output(struct perf_event *event,
 3513                                      struct perf_task_event *task_event)
 3514 {
 3515         struct perf_output_handle handle;
 3516         struct task_struct *task = task_event->task;
 3517         int size, ret;
 3518 
 3519         size  = task_event->event_id.header.size;
 3520         ret = perf_output_begin(&handle, event, size, 0, 0);
 3521 
 3522         if (ret)
 3523                 return;
 3524 
 3525         task_event->event_id.pid = perf_event_pid(event, task);
 3526         task_event->event_id.ppid = perf_event_pid(event, current);
 3527 
 3528         task_event->event_id.tid = perf_event_tid(event, task);
 3529         task_event->event_id.ptid = perf_event_tid(event, current);
 3530 
 3531         perf_output_put(&handle, task_event->event_id);
 3532 
 3533         perf_output_end(&handle);
 3534 }
 3535 
 3536 static int perf_event_task_match(struct perf_event *event)
 3537 {
 3538         if (event->state < PERF_EVENT_STATE_INACTIVE)
 3539                 return 0;
 3540 
 3541         if (event->cpu != -1 && event->cpu != smp_processor_id())
 3542                 return 0;
 3543 
 3544         if (event->attr.comm || event->attr.mmap ||
 3545             event->attr.mmap_data || event->attr.task)
 3546                 return 1;
 3547 
 3548         return 0;
 3549 }
 3550 
 3551 static void perf_event_task_ctx(struct perf_event_context *ctx,
 3552                                   struct perf_task_event *task_event)
 3553 {
 3554         struct perf_event *event;
 3555 
 3556         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 3557                 if (perf_event_task_match(event))
 3558                         perf_event_task_output(event, task_event);
 3559         }
 3560 }
 3561 
 3562 static void perf_event_task_event(struct perf_task_event *task_event)
 3563 {
 3564         struct perf_cpu_context *cpuctx;
 3565         struct perf_event_context *ctx = task_event->task_ctx;
 3566 
 3567         rcu_read_lock();
 3568         cpuctx = &get_cpu_var(perf_cpu_context);
 3569         perf_event_task_ctx(&cpuctx->ctx, task_event);
 3570         if (!ctx)
 3571                 ctx = rcu_dereference(current->perf_event_ctxp);
 3572         if (ctx)
 3573                 perf_event_task_ctx(ctx, task_event);
 3574         put_cpu_var(perf_cpu_context);
 3575         rcu_read_unlock();
 3576 }
 3577 
 3578 static void perf_event_task(struct task_struct *task,
 3579                               struct perf_event_context *task_ctx,
 3580                               int new)
 3581 {
 3582         struct perf_task_event task_event;
 3583 
 3584         if (!atomic_read(&nr_comm_events) &&
 3585             !atomic_read(&nr_mmap_events) &&
 3586             !atomic_read(&nr_task_events))
 3587                 return;
 3588 
 3589         task_event = (struct perf_task_event){
 3590                 .task     = task,
 3591                 .task_ctx = task_ctx,
 3592                 .event_id    = {
 3593                         .header = {
 3594                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
 3595                                 .misc = 0,
 3596                                 .size = sizeof(task_event.event_id),
 3597                         },
 3598                         /* .pid  */
 3599                         /* .ppid */
 3600                         /* .tid  */
 3601                         /* .ptid */
 3602                         .time = perf_clock(),
 3603                 },
 3604         };
 3605 
 3606         perf_event_task_event(&task_event);
 3607 }
 3608 
 3609 void perf_event_fork(struct task_struct *task)
 3610 {
 3611         perf_event_task(task, NULL, 1);
 3612 }
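      /*
       * The matching PERF_RECORD_EXIT is generated by the task-exit path
       * (elsewhere in this file) calling perf_event_task(task, ctx, 0),
       * which selects the EXIT header type in perf_event_task() above.
       */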
 3613 
 3614 /*
 3615  * comm tracking
 3616  */
 3617 
 3618 struct perf_comm_event {
 3619         struct task_struct      *task;
 3620         char                    *comm;
 3621         int                     comm_size;
 3622 
 3623         struct {
 3624                 struct perf_event_header        header;
 3625 
 3626                 u32                             pid;
 3627                 u32                             tid;
 3628         } event_id;
 3629 };
 3630 
 3631 static void perf_event_comm_output(struct perf_event *event,
 3632                                      struct perf_comm_event *comm_event)
 3633 {
 3634         struct perf_output_handle handle;
 3635         int size = comm_event->event_id.header.size;
 3636         int ret = perf_output_begin(&handle, event, size, 0, 0);
 3637 
 3638         if (ret)
 3639                 return;
 3640 
 3641         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
 3642         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
 3643 
 3644         perf_output_put(&handle, comm_event->event_id);
 3645         perf_output_copy(&handle, comm_event->comm,
 3646                                    comm_event->comm_size);
 3647         perf_output_end(&handle);
 3648 }
 3649 
 3650 static int perf_event_comm_match(struct perf_event *event)
 3651 {
 3652         if (event->state < PERF_EVENT_STATE_INACTIVE)
 3653                 return 0;
 3654 
 3655         if (event->cpu != -1 && event->cpu != smp_processor_id())
 3656                 return 0;
 3657 
 3658         if (event->attr.comm)
 3659                 return 1;
 3660 
 3661         return 0;
 3662 }
 3663 
 3664 static void perf_event_comm_ctx(struct perf_event_context *ctx,
 3665                                   struct perf_comm_event *comm_event)
 3666 {
 3667         struct perf_event *event;
 3668 
 3669         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 3670                 if (perf_event_comm_match(event))
 3671                         perf_event_comm_output(event, comm_event);
 3672         }
 3673 }
 3674 
 3675 static void perf_event_comm_event(struct perf_comm_event *comm_event)
 3676 {
 3677         struct perf_cpu_context *cpuctx;
 3678         struct perf_event_context *ctx;
 3679         unsigned int size;
 3680         char comm[TASK_COMM_LEN];
 3681 
 3682         memset(comm, 0, sizeof(comm));
 3683         strlcpy(comm, comm_event->task->comm, sizeof(comm));
 3684         size = ALIGN(strlen(comm)+1, sizeof(u64));
 3685 
 3686         comm_event->comm = comm;
 3687         comm_event->comm_size = size;
 3688 
 3689         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
 3690 
 3691         rcu_read_lock();
 3692         cpuctx = &get_cpu_var(perf_cpu_context);
 3693         perf_event_comm_ctx(&cpuctx->ctx, comm_event);
 3694         ctx = rcu_dereference(current->perf_event_ctxp);
 3695         if (ctx)
 3696                 perf_event_comm_ctx(ctx, comm_event);
 3697         put_cpu_var(perf_cpu_context);
 3698         rcu_read_unlock();
 3699 }
 3700 
 3701 void perf_event_comm(struct task_struct *task)
 3702 {
 3703         struct perf_comm_event comm_event;
 3704 
 3705         if (task->perf_event_ctxp)
 3706                 perf_event_enable_on_exec(task);
 3707 
 3708         if (!atomic_read(&nr_comm_events))
 3709                 return;
 3710 
 3711         comm_event = (struct perf_comm_event){
 3712                 .task   = task,
 3713                 /* .comm      */
 3714                 /* .comm_size */
 3715                 .event_id  = {
 3716                         .header = {
 3717                                 .type = PERF_RECORD_COMM,
 3718                                 .misc = 0,
 3719                                 /* .size */
 3720                         },
 3721                         /* .pid */
 3722                         /* .tid */
 3723                 },
 3724         };
 3725 
 3726         perf_event_comm_event(&comm_event);
 3727 }
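      /*
       * Ordering note: perf_event_enable_on_exec() runs before the
       * PERF_RECORD_COMM record is generated, so an event that was just
       * enabled on exec can still receive a COMM record for the new name.
       */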
 3728 
 3729 /*
 3730  * mmap tracking
 3731  */
 3732 
 3733 struct perf_mmap_event {
 3734         struct vm_area_struct   *vma;
 3735 
 3736         const char              *file_name;
 3737         int                     file_size;
 3738 
 3739         struct {
 3740                 struct perf_event_header        header;
 3741 
 3742                 u32                             pid;
 3743                 u32                             tid;
 3744                 u64                             start;
 3745                 u64                             len;
 3746                 u64                             pgoff;
 3747         } event_id;
 3748 };
 3749 
 3750 static void perf_event_mmap_output(struct perf_event *event,
 3751                                      struct perf_mmap_event *mmap_event)
 3752 {
 3753         struct perf_output_handle handle;
 3754         int size = mmap_event->event_id.header.size;
 3755         int ret = perf_output_begin(&handle, event, size, 0, 0);
 3756 
 3757         if (ret)
 3758                 return;
 3759 
 3760         mmap_event->event_id.pid = perf_event_pid(event, current);
 3761         mmap_event->event_id.tid = perf_event_tid(event, current);
 3762 
 3763         perf_output_put(&handle, mmap_event->event_id);
 3764         perf_output_copy(&handle, mmap_event->file_name,
 3765                                    mmap_event->file_size);
 3766         perf_output_end(&handle);
 3767 }
 3768 
 3769 static int perf_event_mmap_match(struct perf_event *event,
 3770                                    struct perf_mmap_event *mmap_event,
 3771                                    int executable)
 3772 {
 3773         if (event->state < PERF_EVENT_STATE_INACTIVE)
 3774                 return 0;
 3775 
 3776         if (event->cpu != -1 && event->cpu != smp_processor_id())
 3777                 return 0;
 3778 
 3779         if ((!executable && event->attr.mmap_data) ||
 3780             (executable && event->attr.mmap))
 3781                 return 1;
 3782 
 3783         return 0;
 3784 }
 3785 
 3786 static void perf_event_mmap_ctx(struct perf_event_context *ctx,
 3787                                   struct perf_mmap_event *mmap_event,
 3788                                   int executable)
 3789 {
 3790         struct perf_event *event;
 3791 
 3792         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 3793                 if (perf_event_mmap_match(event, mmap_event, executable))
 3794                         perf_event_mmap_output(event, mmap_event);
 3795         }
 3796 }
 3797 
 3798 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 3799 {
 3800         struct perf_cpu_context *cpuctx;
 3801         struct perf_event_context *ctx;
 3802         struct vm_area_struct *vma = mmap_event->vma;
 3803         struct file *file = vma->vm_file;
 3804         unsigned int size;
 3805         char tmp[16];
 3806         char *buf = NULL;
 3807         const char *name;
 3808 
 3809         memset(tmp, 0, sizeof(tmp));
 3810 
 3811         if (file) {
 3812                 /*
 3813                  * d_path works from the end of the buffer backwards, so we
 3814                  * need to add enough zero bytes after the string to handle
 3815                  * the 64-bit alignment we do later.
 3816                  */
 3817                 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
 3818                 if (!buf) {
 3819                         name = strncpy(tmp, "//enomem", sizeof(tmp));
 3820                         goto got_name;
 3821                 }
 3822                 name = d_path(&file->f_path, buf, PATH_MAX);
 3823                 if (IS_ERR(name)) {
 3824                         name = strncpy(tmp, "//toolong", sizeof(tmp));
 3825                         goto got_name;
 3826                 }
 3827         } else {
 3828                 if (arch_vma_name(mmap_event->vma)) {
 3829                         name = strncpy(tmp, arch_vma_name(mmap_event->vma),
 3830                                        sizeof(tmp));
 3831                         goto got_name;
 3832                 }
 3833 
 3834                 if (!vma->vm_mm) {
 3835                         name = strncpy(tmp, "[vdso]", sizeof(tmp));
 3836                         goto got_name;
 3837                 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
 3838                                 vma->vm_end >= vma->vm_mm->brk) {
 3839                         name = strncpy(tmp, "[heap]", sizeof(tmp));
 3840                         goto got_name;
 3841                 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
 3842                                 vma->vm_end >= vma->vm_mm->start_stack) {
 3843                         name = strncpy(tmp, "[stack]", sizeof(tmp));
 3844                         goto got_name;
 3845                 }
 3846 
 3847                 name = strncpy(tmp, "//anon", sizeof(tmp));
 3848                 goto got_name;
 3849         }
 3850 
 3851 got_name:
 3852         size = ALIGN(strlen(name)+1, sizeof(u64));
 3853 
 3854         mmap_event->file_name = name;
 3855         mmap_event->file_size = size;
 3856 
 3857         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 3858 
 3859         rcu_read_lock();
 3860         cpuctx = &get_cpu_var(perf_cpu_context);
 3861         perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC);
 3862         ctx = rcu_dereference(current->perf_event_ctxp);
 3863         if (ctx)
 3864                 perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
 3865         put_cpu_var(perf_cpu_context);
 3866         rcu_read_unlock();
 3867 
 3868         kfree(buf);
 3869 }
 3870 
 3871 void perf_event_mmap(struct vm_area_struct *vma)
 3872 {
 3873         struct perf_mmap_event mmap_event;
 3874 
 3875         if (!atomic_read(&nr_mmap_events))
 3876                 return;
 3877 
 3878         mmap_event = (struct perf_mmap_event){
 3879                 .vma    = vma,
 3880                 /* .file_name */
 3881                 /* .file_size */
 3882                 .event_id  = {
 3883                         .header = {
 3884                                 .type = PERF_RECORD_MMAP,
 3885                                 .misc = PERF_RECORD_MISC_USER,
 3886                                 /* .size */
 3887                         },
 3888                         /* .pid */
 3889                         /* .tid */
 3890                         .start  = vma->vm_start,
 3891                         .len    = vma->vm_end - vma->vm_start,
 3892                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
 3893                 },
 3894         };
 3895 
 3896         perf_event_mmap_event(&mmap_event);
 3897 }
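      /*
       * Example record, with purely illustrative numbers: mapping a shared
       * library at 0x7f0000000000 for 0x1c0000 bytes at file offset 0 gives
       *
       *   PERF_RECORD_MMAP { pid, tid, start = 0x7f0000000000,
       *                      len = 0x1c0000, pgoff = 0,
       *                      filename = "/lib/libfoo.so\0..." }
       *
       * with the filename zero-padded to a multiple of 8 bytes by
       * perf_event_mmap_event() above.
       */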
 3898 
 3899 /*
 3900  * IRQ throttle logging
 3901  */
 3902 
 3903 static void perf_log_throttle(struct perf_event *event, int enable)
 3904 {
 3905         struct perf_output_handle handle;
 3906         int ret;
 3907 
 3908         struct {
 3909                 struct perf_event_header        header;
 3910                 u64                             time;
 3911                 u64                             id;
 3912                 u64                             stream_id;
 3913         } throttle_event = {
 3914                 .header = {
 3915                         .type = PERF_RECORD_THROTTLE,
 3916                         .misc = 0,
 3917                         .size = sizeof(throttle_event),
 3918                 },
 3919                 .time           = perf_clock(),
 3920                 .id             = primary_event_id(event),
 3921                 .stream_id      = event->id,
 3922         };
 3923 
 3924         if (enable)
 3925                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
 3926 
 3927         ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0);
 3928         if (ret)
 3929                 return;
 3930 
 3931         perf_output_put(&handle, throttle_event);
 3932         perf_output_end(&handle);
 3933 }
 3934 
 3935 /*
 3936  * Generic event overflow handling, sampling.
 3937  */
 3938 
 3939 static int __perf_event_overflow(struct perf_event *event, int nmi,
 3940                                    int throttle, struct perf_sample_data *data,
 3941                                    struct pt_regs *regs)
 3942 {
 3943         int events = atomic_read(&event->event_limit);
 3944         struct hw_perf_event *hwc = &event->hw;
 3945         int ret = 0;
 3946 
 3947         throttle = (throttle && event->pmu->unthrottle != NULL);
 3948 
 3949         if (!throttle) {
 3950                 hwc->interrupts++;
 3951         } else {
 3952                 if (hwc->interrupts != MAX_INTERRUPTS) {
 3953                         hwc->interrupts++;
 3954                         if (HZ * hwc->interrupts >
 3955                                         (u64)sysctl_perf_event_sample_rate) {
 3956                                 hwc->interrupts = MAX_INTERRUPTS;
 3957                                 perf_log_throttle(event, 0);
 3958                                 ret = 1;
 3959                         }
 3960                 } else {
 3961                         /*
 3962                          * Keep re-disabling the event even though we disabled it
 3963                          * on the previous pass - just in case we raced with a
 3964                          * sched-in and the event got enabled again:
 3965                          */
 3966                         ret = 1;
 3967                 }
 3968         }
 3969 
 3970         if (event->attr.freq) {
 3971                 u64 now = perf_clock();
 3972                 s64 delta = now - hwc->freq_time_stamp;
 3973 
 3974                 hwc->freq_time_stamp = now;
 3975 
 3976                 if (delta > 0 && delta < 2*TICK_NSEC)
 3977                         perf_adjust_period(event, delta, hwc->last_period);
 3978         }
 3979 
 3980         /*
 3981          * XXX event_limit might not quite work as expected on inherited
 3982          * events
 3983          */
 3984 
 3985         event->pending_kill = POLL_IN;
 3986         if (events && atomic_dec_and_test(&event->event_limit)) {
 3987                 ret = 1;
 3988                 event->pending_kill = POLL_HUP;
 3989                 if (nmi) {
 3990                         event->pending_disable = 1;
 3991                         perf_pending_queue(&event->pending,
 3992                                            perf_pending_event);
 3993                 } else
 3994                         perf_event_disable(event);
 3995         }
 3996 
 3997         if (event->overflow_handler)
 3998                 event->overflow_handler(event, nmi, data, regs);
 3999         else
 4000                 perf_event_output(event, nmi, data, regs);
 4001 
 4002         return ret;
 4003 }
 4004 
 4005 int perf_event_overflow(struct perf_event *event, int nmi,
 4006                           struct perf_sample_data *data,
 4007                           struct pt_regs *regs)
 4008 {
 4009         return __perf_event_overflow(event, nmi, 1, data, regs);
 4010 }
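      /*
       * A non-zero return asks the caller (an architecture PMU interrupt
       * handler, or perf_swevent_overflow() below) to stop feeding this
       * event: either it is now throttled for exceeding the sample rate or
       * its event_limit has been exhausted.
       */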
 4011 
 4012 /*
 4013  * Generic software event infrastructure
 4014  */
 4015 
 4016 /*
 4017  * We directly increment event->count and keep a second value in
 4018  * event->hw.period_left to count intervals. This period counter
 4019  * is kept in the range [-sample_period, 0] so that we can use the
 4020  * sign as a trigger.
 4021  */
 4022 
 4023 static u64 perf_swevent_set_period(struct perf_event *event)
 4024 {
 4025         struct hw_perf_event *hwc = &event->hw;
 4026         u64 period = hwc->last_period;
 4027         u64 nr, offset;
 4028         s64 old, val;
 4029 
 4030         hwc->last_period = hwc->sample_period;
 4031 
 4032 again:
 4033         old = val = local64_read(&hwc->period_left);
 4034         if (val < 0)
 4035                 return 0;
 4036 
 4037         nr = div64_u64(period + val, period);
 4038         offset = nr * period;
 4039         val -= offset;
 4040         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
 4041                 goto again;
 4042 
 4043         return nr;
 4044 }
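      /*
       * Worked example: with sample_period == 100, suppose period_left has
       * just gone positive to +3 (overshoot of 3).  Then nr = (100 + 3) /
       * 100 = 1 overflow is reported and period_left becomes 3 - 100 = -97,
       * i.e. 97 more events until the next sample.
       */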
 4045 
 4046 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
 4047                                     int nmi, struct perf_sample_data *data,
 4048                                     struct pt_regs *regs)
 4049 {
 4050         struct hw_perf_event *hwc = &event->hw;
 4051         int throttle = 0;
 4052 
 4053         data->period = event->hw.last_period;
 4054         if (!overflow)
 4055                 overflow = perf_swevent_set_period(event);
 4056 
 4057         if (hwc->interrupts == MAX_INTERRUPTS)
 4058                 return;
 4059 
 4060         for (; overflow; overflow--) {
 4061                 if (__perf_event_overflow(event, nmi, throttle,
 4062                                             data, regs)) {
 4063                         /*
 4064                          * We inhibit the overflow from happening when
 4065                          * hwc->interrupts == MAX_INTERRUPTS.
 4066                          */
 4067                         break;
 4068                 }
 4069                 throttle = 1;
 4070         }
 4071 }
 4072 
 4073 static void perf_swevent_add(struct perf_event *event, u64 nr,
 4074                                int nmi, struct perf_sample_data *data,
 4075                                struct pt_regs *regs)
 4076 {
 4077         struct hw_perf_event *hwc = &event->hw;
 4078 
 4079         local64_add(nr, &event->count);
 4080 
 4081         if (!regs)
 4082                 return;
 4083 
 4084         if (!hwc->sample_period)
 4085                 return;
 4086 
 4087         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
 4088                 return perf_swevent_overflow(event, 1, nmi, data, regs);
 4089 
 4090         if (local64_add_negative(nr, &hwc->period_left))
 4091                 return;
 4092 
 4093         perf_swevent_overflow(event, 0, nmi, data, regs);
 4094 }
 4095 
 4096 static int perf_exclude_event(struct perf_event *event,
 4097                               struct pt_regs *regs)
 4098 {
 4099         if (regs) {
 4100                 if (event->attr.exclude_user && user_mode(regs))
 4101                         return 1;
 4102 
 4103                 if (event->attr.exclude_kernel && !user_mode(regs))
 4104                         return 1;
 4105         }
 4106 
 4107         return 0;
 4108 }
 4109 
 4110 static int perf_swevent_match(struct perf_event *event,
 4111                                 enum perf_type_id type,
 4112                                 u32 event_id,
 4113                                 struct perf_sample_data *data,
 4114                                 struct pt_regs *regs)
 4115 {
 4116         if (event->attr.type != type)
 4117                 return 0;
 4118 
 4119         if (event->attr.config != event_id)
 4120                 return 0;
 4121 
 4122         if (perf_exclude_event(event, regs))
 4123                 return 0;
 4124 
 4125         return 1;
 4126 }
 4127 
 4128 static inline u64 swevent_hash(u64 type, u32 event_id)
 4129 {
 4130         u64 val = event_id | (type << 32);
 4131 
 4132         return hash_64(val, SWEVENT_HLIST_BITS);
 4133 }
 4134 
 4135 static inline struct hlist_head *
 4136 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
 4137 {
 4138         u64 hash = swevent_hash(type, event_id);
 4139 
 4140         return &hlist->heads[hash];
 4141 }
 4142 
 4143 /* For the read side: look up the hlist head when events trigger */
 4144 static inline struct hlist_head *
 4145 find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
 4146 {
 4147         struct swevent_hlist *hlist;
 4148 
 4149         hlist = rcu_dereference(ctx->swevent_hlist);
 4150         if (!hlist)
 4151                 return NULL;
 4152 
 4153         return __find_swevent_head(hlist, type, event_id);
 4154 }
 4155 
 4156 /* For the update side: event insertion into and removal from the hlist */
 4157 static inline struct hlist_head *
 4158 find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
 4159 {
 4160         struct swevent_hlist *hlist;
 4161         u32 event_id = event->attr.config;
 4162         u64 type = event->attr.type;
 4163 
 4164         /*
 4165          * Event scheduling is always serialized against hlist allocation
 4166          * and release, which makes the protected version suitable here;
 4167          * the context lock guarantees that.
 4168          */
 4169         hlist = rcu_dereference_protected(ctx->swevent_hlist,
 4170                                           lockdep_is_held(&event->ctx->lock));
 4171         if (!hlist)
 4172                 return NULL;
 4173 
 4174         return __find_swevent_head(hlist, type, event_id);
 4175 }
 4176 
 4177 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
 4178                                     u64 nr, int nmi,
 4179                                     struct perf_sample_data *data,
 4180                                     struct pt_regs *regs)
 4181 {
 4182         struct perf_cpu_context *cpuctx;
 4183         struct perf_event *event;
 4184         struct hlist_node *node;
 4185         struct hlist_head *head;
 4186 
 4187         cpuctx = &__get_cpu_var(perf_cpu_context);
 4188 
 4189         rcu_read_lock();
 4190 
 4191         head = find_swevent_head_rcu(cpuctx, type, event_id);
 4192 
 4193         if (!head)
 4194                 goto end;
 4195 
 4196         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 4197                 if (perf_swevent_match(event, type, event_id, data, regs))
 4198                         perf_swevent_add(event, nr, nmi, data, regs);
 4199         }
 4200 end:
 4201         rcu_read_unlock();
 4202 }
 4203 
 4204 int perf_swevent_get_recursion_context(void)
 4205 {
 4206         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 4207         int rctx;
 4208 
 4209         if (in_nmi())
 4210                 rctx = 3;
 4211         else if (in_irq())
 4212                 rctx = 2;
 4213         else if (in_softirq())
 4214                 rctx = 1;
 4215         else
 4216                 rctx = 0;
 4217 
 4218         if (cpuctx->recursion[rctx])
 4219                 return -1;
 4220 
 4221         cpuctx->recursion[rctx]++;
 4222         barrier();
 4223 
 4224         return rctx;
 4225 }
 4226 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 4227 
 4228 inline void perf_swevent_put_recursion_context(int rctx)
 4229 {
 4230         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 4231         barrier();
 4232         cpuctx->recursion[rctx]--;
 4233 }
 4234 
 4235 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 4236                             struct pt_regs *regs, u64 addr)
 4237 {
 4238         struct perf_sample_data data;
 4239         int rctx;
 4240 
 4241         preempt_disable_notrace();
 4242         rctx = perf_swevent_get_recursion_context();
 4243         if (rctx < 0)
 4244                 return;
 4245 
 4246         perf_sample_data_init(&data, addr);
 4247 
 4248         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
 4249 
 4250         perf_swevent_put_recursion_context(rctx);
 4251         preempt_enable_notrace();
 4252 }
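      /*
       * Callers normally go through the perf_sw_event() wrapper; the
       * page-fault handlers, for instance, account faults roughly as
       *
       *   perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
       *
       * i.e. nr == 1 event, nmi == 0, addr == the faulting address.
       */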
 4253 
 4254 static void perf_swevent_read(struct perf_event *event)
 4255 {
 4256 }
 4257 
 4258 static int perf_swevent_enable(struct perf_event *event)
 4259 {
 4260         struct hw_perf_event *hwc = &event->hw;
 4261         struct perf_cpu_context *cpuctx;
 4262         struct hlist_head *head;
 4263 
 4264         cpuctx = &__get_cpu_var(perf_cpu_context);
 4265 
 4266         if (hwc->sample_period) {
 4267                 hwc->last_period = hwc->sample_period;
 4268                 perf_swevent_set_period(event);
 4269         }
 4270 
 4271         head = find_swevent_head(cpuctx, event);
 4272         if (WARN_ON_ONCE(!head))
 4273                 return -EINVAL;
 4274 
 4275         hlist_add_head_rcu(&event->hlist_entry, head);
 4276 
 4277         return 0;
 4278 }
 4279 
 4280 static void perf_swevent_disable(struct perf_event *event)
 4281 {
 4282         hlist_del_rcu(&event->hlist_entry);
 4283 }
 4284 
 4285 static void perf_swevent_void(struct perf_event *event)
 4286 {
 4287 }
 4288 
 4289 static int perf_swevent_int(struct perf_event *event)
 4290 {
 4291         return 0;
 4292 }
 4293 
 4294 static const struct pmu perf_ops_generic = {
 4295         .enable         = perf_swevent_enable,
 4296         .disable        = perf_swevent_disable,
 4297         .start          = perf_swevent_int,
 4298         .stop           = perf_swevent_void,
 4299         .read           = perf_swevent_read,
 4300         .unthrottle     = perf_swevent_void, /* hwc->interrupts already reset */
 4301 };
 4302 
 4303 /*
 4304  * hrtimer based swevent callback
 4305  */
 4306 
 4307 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 4308 {
 4309         enum hrtimer_restart ret = HRTIMER_RESTART;
 4310         struct perf_sample_data data;
 4311         struct pt_regs *regs;
 4312         struct perf_event *event;
 4313         u64 period;
 4314 
 4315         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
 4316         event->pmu->read(event);
 4317 
 4318         perf_sample_data_init(&data, 0);
 4319         data.period = event->hw.last_period;
 4320         regs = get_irq_regs();
 4321 
 4322         if (regs && !perf_exclude_event(event, regs)) {
 4323                 if (!(event->attr.exclude_idle && current->pid == 0))
 4324                         if (perf_event_overflow(event, 0, &data, regs))
 4325                                 ret = HRTIMER_NORESTART;
 4326         }
 4327 
 4328         period = max_t(u64, 10000, event->hw.sample_period);
 4329         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 4330 
 4331         return ret;
 4332 }
 4333 
 4334 static void perf_swevent_start_hrtimer(struct perf_event *event)
 4335 {
 4336         struct hw_perf_event *hwc = &event->hw;
 4337 
 4338         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 4339         hwc->hrtimer.function = perf_swevent_hrtimer;
 4340         if (hwc->sample_period) {
 4341                 u64 period;
 4342 
 4343                 if (hwc->remaining) {
 4344                         if (hwc->remaining < 0)
 4345                                 period = 10000;
 4346                         else
 4347                                 period = hwc->remaining;
 4348                         hwc->remaining = 0;
 4349                 } else {
 4350                         period = max_t(u64, 10000, hwc->sample_period);
 4351                 }
 4352                 __hrtimer_start_range_ns(&hwc->hrtimer,
 4353                                 ns_to_ktime(period), 0,
 4354                                 HRTIMER_MODE_REL, 0);
 4355         }
 4356 }
 4357 
 4358 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
 4359 {
 4360         struct hw_perf_event *hwc = &event->hw;
 4361 
 4362         if (hwc->sample_period) {
 4363                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
 4364                 hwc->remaining = ktime_to_ns(remaining);
 4365 
 4366                 hrtimer_cancel(&hwc->hrtimer);
 4367         }
 4368 }
 4369 
 4370 /*
 4371  * Software event: cpu wall time clock
 4372  */
 4373 
 4374 static void cpu_clock_perf_event_update(struct perf_event *event)
 4375 {
 4376         int cpu = raw_smp_processor_id();
 4377         s64 prev;
 4378         u64 now;
 4379 
 4380         now = cpu_clock(cpu);
 4381         prev = local64_xchg(&event->hw.prev_count, now);
 4382         local64_add(now - prev, &event->count);
 4383 }
 4384 
 4385 static int cpu_clock_perf_event_enable(struct perf_event *event)
 4386 {
 4387         struct hw_perf_event *hwc = &event->hw;
 4388         int cpu = raw_smp_processor_id();
 4389 
 4390         local64_set(&hwc->prev_count, cpu_clock(cpu));
 4391         perf_swevent_start_hrtimer(event);
 4392 
 4393         return 0;
 4394 }
 4395 
 4396 static void cpu_clock_perf_event_disable(struct perf_event *event)
 4397 {
 4398         perf_swevent_cancel_hrtimer(event);
 4399         cpu_clock_perf_event_update(event);
 4400 }
 4401 
 4402 static void cpu_clock_perf_event_read(struct perf_event *event)
 4403 {
 4404         cpu_clock_perf_event_update(event);
 4405 }
 4406 
 4407 static const struct pmu perf_ops_cpu_clock = {
 4408         .enable         = cpu_clock_perf_event_enable,
 4409         .disable        = cpu_clock_perf_event_disable,
 4410         .read           = cpu_clock_perf_event_read,
 4411 };
 4412 
 4413 /*
 4414  * Software event: task time clock
 4415  */
 4416 
 4417 static void task_clock_perf_event_update(struct perf_event *event, u64 now)
 4418 {
 4419         u64 prev;
 4420         s64 delta;
 4421 
 4422         prev = local64_xchg(&event->hw.prev_count, now);
 4423         delta = now - prev;
 4424         local64_add(delta, &event->count);
 4425 }
 4426 
 4427 static int task_clock_perf_event_enable(struct perf_event *event)
 4428 {
 4429         struct hw_perf_event *hwc = &event->hw;
 4430         u64 now;
 4431 
 4432         now = event->ctx->time;
 4433 
 4434         local64_set(&hwc->prev_count, now);
 4435 
 4436         perf_swevent_start_hrtimer(event);
 4437 
 4438         return 0;
 4439 }
 4440 
 4441 static void task_clock_perf_event_disable(struct perf_event *event)
 4442 {
 4443         perf_swevent_cancel_hrtimer(event);
 4444         task_clock_perf_event_update(event, event->ctx->time);
 4445 
 4446 }
 4447 
 4448 static void task_clock_perf_event_read(struct perf_event *event)
 4449 {
 4450         u64 time;
 4451 
 4452         if (!in_nmi()) {
 4453                 update_context_time(event->ctx);
 4454                 time = event->ctx->time;
 4455         } else {
 4456                 u64 now = perf_clock();
 4457                 u64 delta = now - event->ctx->timestamp;
 4458                 time = event->ctx->time + delta;
 4459         }
 4460 
 4461         task_clock_perf_event_update(event, time);
 4462 }
 4463 
 4464 static const struct pmu perf_ops_task_clock = {
 4465         .enable         = task_clock_perf_event_enable,
 4466         .disable        = task_clock_perf_event_disable,
 4467         .read           = task_clock_perf_event_read,
 4468 };
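      /*
       * Both clock pmus sample through the hrtimer started by
       * perf_swevent_start_hrtimer(); they differ only in what backs
       * event->count: cpu_clock() wall time for the cpu-clock event versus
       * the context's accumulated time for the task-clock event.
       */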
 4469 
 4470 /* Deref the hlist from the update side */
 4471 static inline struct swevent_hlist *
 4472 swevent_hlist_deref(struct perf_cpu_context *cpuctx)
 4473 {
 4474         return rcu_dereference_protected(cpuctx->swevent_hlist,
 4475                                          lockdep_is_held(&cpuctx->hlist_mutex));
 4476 }
 4477 
 4478 static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
 4479 {
 4480         struct swevent_hlist *hlist;
 4481 
 4482         hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
 4483         kfree(hlist);
 4484 }
 4485 
 4486 static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
 4487 {
 4488         struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
 4489 
 4490         if (!hlist)
 4491                 return;
 4492 
 4493         rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
 4494         call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
 4495 }
 4496 
 4497 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
 4498 {
 4499         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 4500 
 4501         mutex_lock(&cpuctx->hlist_mutex);
 4502 
 4503         if (!--cpuctx->hlist_refcount)
 4504                 swevent_hlist_release(cpuctx);
 4505 
 4506         mutex_unlock(&cpuctx->hlist_mutex);
 4507 }
 4508 
 4509 static void swevent_hlist_put(struct perf_event *event)
 4510 {
 4511         int cpu;
 4512 
 4513         if (event->cpu != -1) {
 4514                 swevent_hlist_put_cpu(event, event->cpu);
 4515                 return;
 4516         }
 4517 
 4518         for_each_possible_cpu(cpu)
 4519                 swevent_hlist_put_cpu(event, cpu);
 4520 }
 4521 
 4522 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
 4523 {
 4524         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 4525         int err = 0;
 4526 
 4527         mutex_lock(&cpuctx->hlist_mutex);
 4528 
 4529         if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
 4530                 struct swevent_hlist *hlist;
 4531 
 4532                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
 4533                 if (!hlist) {
 4534                         err = -ENOMEM;
 4535                         goto exit;
 4536                 }
 4537                 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
 4538         }
 4539         cpuctx->hlist_refcount++;
 4540  exit:
 4541         mutex_unlock(&cpuctx->hlist_mutex);
 4542 
 4543         return err;
 4544 }
 4545 
 4546 static int swevent_hlist_get(struct perf_event *event)
 4547 {
 4548         int err;
 4549         int cpu, failed_cpu;
 4550 
 4551         if (event->cpu != -1)
 4552                 return swevent_hlist_get_cpu(event, event->cpu);
 4553 
 4554         get_online_cpus();
 4555         for_each_possible_cpu(cpu) {
 4556                 err = swevent_hlist_get_cpu(event, cpu);
 4557                 if (err) {
 4558                         failed_cpu = cpu;
 4559                         goto fail;
 4560                 }
 4561         }
 4562         put_online_cpus();
 4563 
 4564         return 0;
 4565  fail:
 4566         for_each_possible_cpu(cpu) {
 4567                 if (cpu == failed_cpu)
 4568                         break;
 4569                 swevent_hlist_put_cpu(event, cpu);
 4570         }
 4571 
 4572         put_online_cpus();
 4573         return err;
 4574 }
 4575 
 4576 #ifdef CONFIG_EVENT_TRACING
 4577 
 4578 static const struct pmu perf_ops_tracepoint = {
 4579         .enable         = perf_trace_enable,
 4580         .disable        = perf_trace_disable,
 4581         .start          = perf_swevent_int,
 4582         .stop           = perf_swevent_void,
 4583         .read           = perf_swevent_read,
 4584         .unthrottle     = perf_swevent_void,
 4585 };
 4586 
 4587 static int perf_tp_filter_match(struct perf_event *event,
 4588                                 struct perf_sample_data *data)
 4589 {
 4590         void *record = data->raw->data;
 4591 
 4592         if (likely(!event->filter) || filter_match_preds(event->filter, record))
 4593                 return 1;
 4594         return 0;
 4595 }
 4596 
 4597 static int perf_tp_event_match(struct perf_event *event,
 4598                                 struct perf_sample_data *data,
 4599                                 struct pt_regs *regs)
 4600 {
 4601         /*
 4602          * All tracepoints are from kernel-space.
 4603          */
 4604         if (event->attr.exclude_kernel)
 4605                 return 0;
 4606 
 4607         if (!perf_tp_filter_match(event, data))
 4608                 return 0;
 4609 
 4610         return 1;
 4611 }
 4612 
 4613 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
 4614                    struct pt_regs *regs, struct hlist_head *head, int rctx)
 4615 {
 4616         struct perf_sample_data data;
 4617         struct perf_event *event;
 4618         struct hlist_node *node;
 4619 
 4620         struct perf_raw_record raw = {
 4621                 .size = entry_size,
 4622                 .data = record,
 4623         };
 4624 
 4625         perf_sample_data_init(&data, addr);
 4626         data.raw = &raw;
 4627 
 4628         hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
 4629                 if (perf_tp_event_match(event, &data, regs))
 4630                         perf_swevent_add(event, count, 1, &data, regs);
 4631         }
 4632 
 4633         perf_swevent_put_recursion_context(rctx);
 4634 }
 4635 EXPORT_SYMBOL_GPL(perf_tp_event);
 4636 
 4637 static void tp_perf_event_destroy(struct perf_event *event)
 4638 {
 4639         perf_trace_destroy(event);
 4640 }
 4641 
 4642 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 4643 {
 4644         int err;
 4645 
 4646         /*
 4647          * Raw tracepoint data is a severe data leak; only allow root to
 4648          * have these.
 4649          */
 4650         if ((event->attr.sample_type & PERF_SAMPLE_RAW) &&
 4651                         perf_paranoid_tracepoint_raw() &&
 4652                         !capable(CAP_SYS_ADMIN))
 4653                 return ERR_PTR(-EPERM);
 4654 
 4655         err = perf_trace_init(event);
 4656         if (err)
 4657                 return NULL;
 4658 
 4659         event->destroy = tp_perf_event_destroy;
 4660 
 4661         return &perf_ops_tracepoint;
 4662 }
 4663 
 4664 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 4665 {
 4666         char *filter_str;
 4667         int ret;
 4668 
 4669         if (event->attr.type != PERF_TYPE_TRACEPOINT)
 4670                 return -EINVAL;
 4671 
 4672         filter_str = strndup_user(arg, PAGE_SIZE);
 4673         if (IS_ERR(filter_str))
 4674                 return PTR_ERR(filter_str);
 4675 
 4676         ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
 4677 
 4678         kfree(filter_str);
 4679         return ret;
 4680 }
 4681 
 4682 static void perf_event_free_filter(struct perf_event *event)
 4683 {
 4684         ftrace_profile_free_filter(event);
 4685 }
 4686 
 4687 #else
 4688 
 4689 static const struct pmu *tp_perf_event_init(struct perf_event *event)
 4690 {
 4691         return NULL;
 4692 }
 4693 
 4694 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
 4695 {
 4696         return -ENOENT;
 4697 }
 4698 
 4699 static void perf_event_free_filter(struct perf_event *event)
 4700 {
 4701 }
 4702 
 4703 #endif /* CONFIG_EVENT_TRACING */
 4704 
 4705 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 4706 static void bp_perf_event_destroy(struct perf_event *event)
 4707 {
 4708         release_bp_slot(event);
 4709 }
 4710 
 4711 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
 4712 {
 4713         int err;
 4714 
 4715         err = register_perf_hw_breakpoint(bp);
 4716         if (err)
 4717                 return ERR_PTR(err);
 4718 
 4719         bp->destroy = bp_perf_event_destroy;
 4720 
 4721         return &perf_ops_bp;
 4722 }
 4723 
 4724 void perf_bp_event(struct perf_event *bp, void *data)
 4725 {
 4726         struct perf_sample_data sample;
 4727         struct pt_regs *regs = data;
 4728 
 4729         perf_sample_data_init(&sample, bp->attr.bp_addr);
 4730 
 4731         if (!perf_exclude_event(bp, regs))
 4732                 perf_swevent_add(bp, 1, 1, &sample, regs);
 4733 }
 4734 #else
 4735 static const struct pmu *bp_perf_event_init(struct perf_event *bp)
 4736 {
 4737         return NULL;
 4738 }
 4739 
 4740 void perf_bp_event(struct perf_event *bp, void *regs)
 4741 {
 4742 }
 4743 #endif
 4744 
 4745 atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
 4746 
 4747 static void sw_perf_event_destroy(struct perf_event *event)
 4748 {
 4749         u64 event_id = event->attr.config;
 4750 
 4751         WARN_ON(event->parent);
 4752 
 4753         atomic_dec(&perf_swevent_enabled[event_id]);
 4754         swevent_hlist_put(event);
 4755 }
 4756 
 4757 static const struct pmu *sw_perf_event_init(struct perf_event *event)
 4758 {
 4759         const struct pmu *pmu = NULL;
 4760         u64 event_id = event->attr.config;
 4761 
 4762         /*
 4763          * Software events (currently) can't in general distinguish
 4764          * between user, kernel and hypervisor events.
 4765          * However, context switches and cpu migrations are considered
 4766          * to be kernel events, and page faults are never hypervisor
 4767          * events.
 4768          */
 4769         switch (event_id) {
 4770         case PERF_COUNT_SW_CPU_CLOCK:
 4771                 pmu = &perf_ops_cpu_clock;
 4772 
 4773                 break;
 4774         case PERF_COUNT_SW_TASK_CLOCK:
 4775                 /*
 4776                  * If the user instantiates this as a per-cpu event,
 4777                  * use the cpu_clock event instead.
 4778                  */
 4779                 if (event->ctx->task)
 4780                         pmu = &perf_ops_task_clock;
 4781                 else
 4782                         pmu = &perf_ops_cpu_clock;
 4783 
 4784                 break;
 4785         case PERF_COUNT_SW_PAGE_FAULTS:
 4786         case PERF_COUNT_SW_PAGE_FAULTS_MIN:
 4787         case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
 4788         case PERF_COUNT_SW_CONTEXT_SWITCHES:
 4789         case PERF_COUNT_SW_CPU_MIGRATIONS:
 4790         case PERF_COUNT_SW_ALIGNMENT_FAULTS:
 4791         case PERF_COUNT_SW_EMULATION_FAULTS:
 4792                 if (!event->parent) {
 4793                         int err;
 4794 
 4795                         err = swevent_hlist_get(event);
 4796                         if (err)
 4797                                 return ERR_PTR(err);
 4798 
 4799                         atomic_inc(&perf_swevent_enabled[event_id]);
 4800                         event->destroy = sw_perf_event_destroy;
 4801                 }
 4802                 pmu = &perf_ops_generic;
 4803                 break;
 4804         }
 4805 
 4806         return pmu;
 4807 }
 4808 
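/*
 * Illustrative sketch (not part of this source file) of the attr values
 * that select the software pmus above.  Note the quirk documented in
 * sw_perf_event_init(): a PERF_COUNT_SW_TASK_CLOCK event opened per-cpu
 * has no task context and is silently backed by the cpu-clock pmu
 * instead.  The helper name is hypothetical.
 */
#include <linux/perf_event.h>
#include <string.h>

static void init_task_clock_attr(struct perf_event_attr *attr)
{
        memset(attr, 0, sizeof(*attr));
        attr->size = sizeof(*attr);
        attr->type = PERF_TYPE_SOFTWARE;         /* routed to sw_perf_event_init() */
        attr->config = PERF_COUNT_SW_TASK_CLOCK; /* cpu-clock if opened per-cpu */
}
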
 4809 /*
 4810  * Allocate and initialize an event structure
 4811  */
 4812 static struct perf_event *
 4813 perf_event_alloc(struct perf_event_attr *attr,
 4814                    int cpu,
 4815                    struct perf_event_context *ctx,
 4816                    struct perf_event *group_leader,
 4817                    struct perf_event *parent_event,
 4818                    perf_overflow_handler_t overflow_handler,
 4819                    gfp_t gfpflags)
 4820 {
 4821         const struct pmu *pmu;
 4822         struct perf_event *event;
 4823         struct hw_perf_event *hwc;
 4824         long err;
 4825 
 4826         event = kzalloc(sizeof(*event), gfpflags);
 4827         if (!event)
 4828                 return ERR_PTR(-ENOMEM);
 4829 
 4830         /*
 4831          * Single events are their own group leaders, with an
 4832          * empty sibling list:
 4833          */
 4834         if (!group_leader)
 4835                 group_leader = event;
 4836 
 4837         mutex_init(&event->child_mutex);
 4838         INIT_LIST_HEAD(&event->child_list);
 4839 
 4840         INIT_LIST_HEAD(&event->group_entry);
 4841         INIT_LIST_HEAD(&event->event_entry);
 4842         INIT_LIST_HEAD(&event->sibling_list);
 4843         init_waitqueue_head(&event->waitq);
 4844 
 4845         mutex_init(&event->mmap_mutex);
 4846 
 4847         event->cpu              = cpu;
 4848         event->attr             = *attr;
 4849         event->group_leader     = group_leader;
 4850         event->pmu              = NULL;
 4851         event->ctx              = ctx;
 4852         event->oncpu            = -1;
 4853 
 4854         event->parent           = parent_event;
 4855 
 4856         event->ns               = get_pid_ns(current->nsproxy->pid_ns);
 4857         event->id               = atomic64_inc_return(&perf_event_id);
 4858 
 4859         event->state            = PERF_EVENT_STATE_INACTIVE;
 4860 
 4861         if (!overflow_handler && parent_event)
 4862                 overflow_handler = parent_event->overflow_handler;
 4863
 4864         event->overflow_handler = overflow_handler;
 4865 
 4866         if (attr->disabled)
 4867                 event->state = PERF_EVENT_STATE_OFF;
 4868 
 4869         pmu = NULL;
 4870 
 4871         hwc = &event->hw;
 4872         hwc->sample_period = attr->sample_period;
 4873         if (attr->freq && attr->sample_freq)
 4874                 hwc->sample_period = 1;
 4875         hwc->last_period = hwc->sample_period;
 4876 
 4877         local64_set(&hwc->period_left, hwc->sample_period);
 4878 
 4879         /*
 4880          * we currently do not support PERF_FORMAT_GROUP on inherited events
 4881          */
 4882         if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 4883                 goto done;
 4884 
 4885         switch (attr->type) {
 4886         case PERF_TYPE_RAW:
 4887         case PERF_TYPE_HARDWARE:
 4888         case PERF_TYPE_HW_CACHE:
 4889                 pmu = hw_perf_event_init(event);
 4890                 break;
 4891 
 4892         case PERF_TYPE_SOFTWARE:
 4893                 pmu = sw_perf_event_init(event);
 4894                 break;
 4895 
 4896         case PERF_TYPE_TRACEPOINT:
 4897                 pmu = tp_perf_event_init(event);
 4898                 break;
 4899 
 4900         case PERF_TYPE_BREAKPOINT:
 4901                 pmu = bp_perf_event_init(event);
 4902                 break;
 4903 
 4904 
 4905         default:
 4906                 break;
 4907         }
 4908 done:
 4909         err = 0;
 4910         if (!pmu)
 4911                 err = -EINVAL;
 4912         else if (IS_ERR(pmu))
 4913                 err = PTR_ERR(pmu);
 4914 
 4915         if (err) {
 4916                 if (event->ns)
 4917                         put_pid_ns(event->ns);
 4918                 kfree(event);
 4919                 return ERR_PTR(err);
 4920         }
 4921 
 4922         event->pmu = pmu;
 4923 
 4924         if (!event->parent) {
 4925                 atomic_inc(&nr_events);
 4926                 if (event->attr.mmap || event->attr.mmap_data)
 4927                         atomic_inc(&nr_mmap_events);
 4928                 if (event->attr.comm)
 4929                         atomic_inc(&nr_comm_events);
 4930                 if (event->attr.task)
 4931                         atomic_inc(&nr_task_events);
 4932         }
 4933 
 4934         return event;
 4935 }
 4936 
 4937 static int perf_copy_attr(struct perf_event_attr __user *uattr,
 4938                           struct perf_event_attr *attr)
 4939 {
 4940         u32 size;
 4941         int ret;
 4942 
 4943         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
 4944                 return -EFAULT;
 4945 
 4946         /*
 4947          * zero the full structure, so that a short copy leaves the rest zeroed.
 4948          */
 4949         memset(attr, 0, sizeof(*attr));
 4950 
 4951         ret = get_user(size, &uattr->size);
 4952         if (ret)
 4953                 return ret;
 4954 
 4955         if (size > PAGE_SIZE)   /* silly large */
 4956                 goto err_size;
 4957 
 4958         if (!size)              /* abi compat */
 4959                 size = PERF_ATTR_SIZE_VER0;
 4960 
 4961         if (size < PERF_ATTR_SIZE_VER0)
 4962                 goto err_size;
 4963 
 4964         /*
 4965          * If we're handed a bigger struct than we know of,
 4966          * ensure all the unknown bits are 0 - i.e. new
 4967          * user-space does not rely on any kernel feature
 4968          * extensions we don't know about yet.
 4969          */
 4970         if (size > sizeof(*attr)) {
 4971                 unsigned char __user *addr;
 4972                 unsigned char __user *end;
 4973                 unsigned char val;
 4974 
 4975                 addr = (void __user *)uattr + sizeof(*attr);
 4976                 end  = (void __user *)uattr + size;
 4977 
 4978                 for (; addr < end; addr++) {
 4979                         ret = get_user(val, addr);
 4980                         if (ret)
 4981                                 return ret;
 4982                         if (val)
 4983                                 goto err_size;
 4984                 }
 4985                 size = sizeof(*attr);
 4986         }
 4987 
 4988         ret = copy_from_user(attr, uattr, size);
 4989         if (ret)
 4990                 return -EFAULT;
 4991 
 4992         /*
 4993          * If the type exists, the corresponding creation will verify
 4994          * the attr->config.
 4995          */
 4996         if (attr->type >= PERF_TYPE_MAX)
 4997                 return -EINVAL;
 4998 
 4999         if (attr->__reserved_1)
 5000                 return -EINVAL;
 5001 
 5002         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
 5003                 return -EINVAL;
 5004 
 5005         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
 5006                 return -EINVAL;
 5007 
 5008 out:
 5009         return ret;
 5010 
 5011 err_size:
 5012         put_user(sizeof(*attr), &uattr->size);
 5013         ret = -E2BIG;
 5014         goto out;
 5015 }
 5016 
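/*
 * Illustrative userspace sketch (not part of this source file) of the
 * size-versioning contract enforced by perf_copy_attr() above: a caller
 * may pass a larger struct than the kernel knows, but every unknown
 * trailing byte must be zero, otherwise the syscall fails with E2BIG
 * after writing the kernel's sizeof(attr) back into uattr->size.  The
 * retry below therefore drops whatever newer fields the kernel could not
 * understand; the helper name is hypothetical.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>

static long open_with_size_fallback(struct perf_event_attr *attr, pid_t pid,
                                    int cpu, int group_fd, unsigned long flags)
{
        long fd = syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);

        if (fd < 0 && errno == E2BIG) {
                /* attr->size now holds the kernel's struct size; the retry
                 * reads only that known prefix and ignores the newer tail. */
                fd = syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
        }
        return fd;
}
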
 5017 static int
 5018 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 5019 {
 5020         struct perf_buffer *buffer = NULL, *old_buffer = NULL;
 5021         int ret = -EINVAL;
 5022 
 5023         if (!output_event)
 5024                 goto set;
 5025 
 5026         /* don't allow circular references */
 5027         if (event == output_event)
 5028                 goto out;
 5029 
 5030         /*
 5031          * Don't allow cross-cpu buffers
 5032          */
 5033         if (output_event->cpu != event->cpu)
 5034                 goto out;
 5035 
 5036         /*
 5037          * If it's not a per-cpu buffer, it must be the same task.
 5038          */
 5039         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
 5040                 goto out;
 5041 
 5042 set:
 5043         mutex_lock(&event->mmap_mutex);
 5044         /* Can't redirect output if we've got an active mmap() */
 5045         if (atomic_read(&event->mmap_count))
 5046                 goto unlock;
 5047 
 5048         if (output_event) {
 5049                 /* get the buffer we want to redirect to */
 5050                 buffer = perf_buffer_get(output_event);
 5051                 if (!buffer)
 5052                         goto unlock;
 5053         }
 5054 
 5055         old_buffer = event->buffer;
 5056         rcu_assign_pointer(event->buffer, buffer);
 5057         ret = 0;
 5058 unlock:
 5059         mutex_unlock(&event->mmap_mutex);
 5060 
 5061         if (old_buffer)
 5062                 perf_buffer_put(old_buffer);
 5063 out:
 5064         return ret;
 5065 }
 5066 
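/*
 * Illustrative userspace sketch (not part of this source file): the
 * PERF_EVENT_IOC_SET_OUTPUT ioctl ends up in perf_event_set_output()
 * above and redirects one event's records into another event's mmap()
 * buffer.  Per the checks above, both events must share a cpu (or a task
 * context) and the redirected event must not be mmap()ed itself.  The
 * descriptor names are hypothetical.
 */
#include <linux/perf_event.h>
#include <sys/ioctl.h>

static int route_samples(int sampler_fd, int buffer_owner_fd)
{
        /* Afterwards, sampler_fd's records show up in the ring buffer
         * that was mmap()ed on buffer_owner_fd. */
        return ioctl(sampler_fd, PERF_EVENT_IOC_SET_OUTPUT, buffer_owner_fd);
}
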
 5067 /**
 5068  * sys_perf_event_open - open a performance event, associate it to a task/cpu
 5069  *
 5070  * @attr_uptr:  event_id type attributes for monitoring/sampling
 5071  * @pid:                target pid
 5072  * @cpu:                target cpu
 5073  * @group_fd:           group leader event fd
 5074  */
 5075 SYSCALL_DEFINE5(perf_event_open,
 5076                 struct perf_event_attr __user *, attr_uptr,
 5077                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 5078 {
 5079         struct perf_event *event, *group_leader = NULL, *output_event = NULL;
 5080         struct perf_event_attr attr;
 5081         struct perf_event_context *ctx;
 5082         struct file *event_file = NULL;
 5083         struct file *group_file = NULL;
 5084         int event_fd;
 5085         int fput_needed = 0;
 5086         int err;
 5087 
 5088         /* for future expandability... */
 5089         if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
 5090                 return -EINVAL;
 5091 
 5092         err = perf_copy_attr(attr_uptr, &attr);
 5093         if (err)
 5094                 return err;
 5095 
 5096         if (!attr.exclude_kernel) {
 5097                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
 5098                         return -EACCES;
 5099         }
 5100 
 5101         if (attr.freq) {
 5102                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
 5103                         return -EINVAL;
 5104         }
 5105 
 5106         event_fd = get_unused_fd_flags(O_RDWR);
 5107         if (event_fd < 0)
 5108                 return event_fd;
 5109 
 5110         /*
 5111          * Get the target context (task or percpu):
 5112          */
 5113         ctx = find_get_context(pid, cpu);
 5114         if (IS_ERR(ctx)) {
 5115                 err = PTR_ERR(ctx);
 5116                 goto err_fd;
 5117         }
 5118 
 5119         if (group_fd != -1) {
 5120                 group_leader = perf_fget_light(group_fd, &fput_needed);
 5121                 if (IS_ERR(group_leader)) {
 5122                         err = PTR_ERR(group_leader);
 5123                         goto err_put_context;
 5124                 }
 5125                 group_file = group_leader->filp;
 5126                 if (flags & PERF_FLAG_FD_OUTPUT)
 5127                         output_event = group_leader;
 5128                 if (flags & PERF_FLAG_FD_NO_GROUP)
 5129                         group_leader = NULL;
 5130         }
 5131 
 5132         /*
 5133          * Validate the group leader (we will attach this event to it):
 5134          */
 5135         if (group_leader) {
 5136                 err = -EINVAL;
 5137 
 5138                 /*
 5139                  * Do not allow a recursive hierarchy (this new sibling
 5140                  * becoming part of another group-sibling):
 5141                  */
 5142                 if (group_leader->group_leader != group_leader)
 5143                         goto err_put_context;
 5144                 /*
 5145                  * Do not allow attaching to a group in a different
 5146                  * task or CPU context:
 5147                  */
 5148                 if (group_leader->ctx != ctx)
 5149                         goto err_put_context;
 5150                 /*
 5151                  * Only a group leader can be exclusive or pinned
 5152                  */
 5153                 if (attr.exclusive || attr.pinned)
 5154                         goto err_put_context;
 5155         }
 5156 
 5157         event = perf_event_alloc(&attr, cpu, ctx, group_leader,
 5158                                      NULL, NULL, GFP_KERNEL);
 5159         if (IS_ERR(event)) {
 5160                 err = PTR_ERR(event);
 5161                 goto err_put_context;
 5162         }
 5163 
 5164         if (output_event) {
 5165                 err = perf_event_set_output(event, output_event);
 5166                 if (err)
 5167                         goto err_free_put_context;
 5168         }
 5169 
 5170         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
 5171         if (IS_ERR(event_file)) {
 5172                 err = PTR_ERR(event_file);
 5173                 goto err_free_put_context;
 5174         }
 5175 
 5176         event->filp = event_file;
 5177         WARN_ON_ONCE(ctx->parent_ctx);
 5178         mutex_lock(&ctx->mutex);
 5179         perf_install_in_context(ctx, event, cpu);
 5180         ++ctx->generation;
 5181         mutex_unlock(&ctx->mutex);
 5182 
 5183         event->owner = current;
 5184         get_task_struct(current);
 5185         mutex_lock(&current->perf_event_mutex);
 5186         list_add_tail(&event->owner_entry, &current->perf_event_list);
 5187         mutex_unlock(&current->perf_event_mutex);
 5188 
 5189         /*
 5190          * Drop the reference on the group_event after placing the
 5191          * new event on the sibling_list. This ensures destruction
 5192          * of the group leader will find the pointer to itself in
 5193          * perf_group_detach().
 5194          */
 5195         fput_light(group_file, fput_needed);
 5196         fd_install(event_fd, event_file);
 5197         return event_fd;
 5198 
 5199 err_free_put_context:
 5200         free_event(event);
 5201 err_put_context:
 5202         fput_light(group_file, fput_needed);
 5203         put_ctx(ctx);
 5204 err_fd:
 5205         put_unused_fd(event_fd);
 5206         return err;
 5207 }
 5208 
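/*
 * Illustrative userspace sketch (not part of this source file): invoking
 * sys_perf_event_open() above for a simple counting event.  glibc ships
 * no wrapper for this syscall, so the raw syscall number is used; the
 * wrapper and main() below exist only for illustration.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        long long count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);                /* validated by perf_copy_attr() */
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_TASK_CLOCK;
        attr.disabled = 1;                       /* created in PERF_EVENT_STATE_OFF */

        /* pid == 0: current task, cpu == -1: any cpu, no group, no flags. */
        fd = perf_event_open(&attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
        /* ... workload being measured ... */
        ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("task clock: %lld ns\n", count);

        close(fd);
        return 0;
}
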
 5209 /**
 5210  * perf_event_create_kernel_counter
 5211  *
 5212  * @attr: attributes of the counter to create
 5213  * @cpu: cpu to which the counter is bound
 5214  * @pid: task to profile
 5215  */
 5216 struct perf_event *
 5217 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 5218                                  pid_t pid,
 5219                                  perf_overflow_handler_t overflow_handler)
 5220 {
 5221         struct perf_event *event;
 5222         struct perf_event_context *ctx;
 5223         int err;
 5224 
 5225         /*
 5226          * Get the target context (task or percpu):
 5227          */
 5228 
 5229         ctx = find_get_context(pid, cpu);
 5230         if (IS_ERR(ctx)) {
 5231                 err = PTR_ERR(ctx);
 5232                 goto err_exit;
 5233         }
 5234 
 5235         event = perf_event_alloc(attr, cpu, ctx, NULL,
 5236                                  NULL, overflow_handler, GFP_KERNEL);
 5237         if (IS_ERR(event)) {
 5238                 err = PTR_ERR(event);
 5239                 goto err_put_context;
 5240         }
 5241 
 5242         event->filp = NULL;
 5243         WARN_ON_ONCE(ctx->parent_ctx);
 5244         mutex_lock(&ctx->mutex);
 5245         perf_install_in_context(ctx, event, cpu);
 5246         ++ctx->generation;
 5247         mutex_unlock(&ctx->mutex);
 5248 
 5249         event->owner = current;
 5250         get_task_struct(current);
 5251         mutex_lock(&current->perf_event_mutex);
 5252         list_add_tail(&event->owner_entry, &current->perf_event_list);
 5253         mutex_unlock(&current->perf_event_mutex);
 5254 
 5255         return event;
 5256 
 5257  err_put_context:
 5258         put_ctx(ctx);
 5259  err_exit:
 5260         return ERR_PTR(err);
 5261 }
 5262 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
 5263 
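/*
 * Illustrative in-kernel sketch (not part of this source file): using
 * perf_event_create_kernel_counter() above to run a sampling cpu-clock
 * event on one cpu.  The callback signature follows the
 * perf_overflow_handler_t typedef of this kernel generation; the function
 * names and the sample period are hypothetical.  Such an event would
 * normally be torn down later with perf_event_release_kernel().
 */
#include <linux/perf_event.h>

static void sample_overflow(struct perf_event *event, int nmi,
                            struct perf_sample_data *data,
                            struct pt_regs *regs)
{
        /* Runs from the overflow path, possibly in NMI context:
         * keep the work minimal and lock-free. */
}

static struct perf_event *start_cpu_clock_sampler(int cpu)
{
        struct perf_event_attr attr = {
                .type          = PERF_TYPE_SOFTWARE,
                .config        = PERF_COUNT_SW_CPU_CLOCK,
                .sample_period = 1000000,       /* one callback per ms of cpu time */
        };

        /* cpu >= 0 selects the per-cpu context; the pid argument is then ignored. */
        return perf_event_create_kernel_counter(&attr, cpu, -1, sample_overflow);
}
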
 5264 /*
 5265  * inherit an event from parent task to child task:
 5266  */
 5267 static struct perf_event *
 5268 inherit_event(struct perf_event *parent_event,
 5269               struct task_struct *parent,
 5270               struct perf_event_context *parent_ctx,
 5271               struct task_struct *child,
 5272               struct perf_event *group_leader,
 5273               struct perf_event_context *child_ctx)
 5274 {
 5275         struct perf_event *child_event;
 5276 
 5277         /*
 5278          * Instead of creating recursive hierarchies of events,
 5279          * we link inherited events back to the original parent,
 5280          * which has a filp for sure, which we use as the reference
 5281  * which is guaranteed to have a filp that we use as the reference
 5282          */
 5283         if (parent_event->parent)
 5284                 parent_event = parent_event->parent;
 5285 
 5286         child_event = perf_event_alloc(&parent_event->attr,
 5287                                            parent_event->cpu, child_ctx,
 5288                                            group_leader, parent_event,
 5289                                            NULL, GFP_KERNEL);
 5290         if (IS_ERR(child_event))
 5291                 return child_event;
 5292         get_ctx(child_ctx);
 5293 
 5294         /*
 5295          * Make the child state follow the state of the parent event,
 5296          * not its attr.disabled bit.  We hold the parent's mutex,
 5297          * so we won't race with perf_event_{en, dis}able_family.
 5298          */
 5299         if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
 5300                 child_event->state = PERF_EVENT_STATE_INACTIVE;
 5301         else
 5302                 child_event->state = PERF_EVENT_STATE_OFF;
 5303 
 5304         if (parent_event->attr.freq) {
 5305                 u64 sample_period = parent_event->hw.sample_period;
 5306                 struct hw_perf_event *hwc = &child_event->hw;
 5307 
 5308                 hwc->sample_period = sample_period;
 5309                 hwc->last_period   = sample_period;
 5310 
 5311                 local64_set(&hwc->period_left, sample_period);
 5312         }
 5313 
 5314         child_event->overflow_handler = parent_event->overflow_handler;
 5315 
 5316         /*
 5317          * Link it up in the child's context:
 5318          */
 5319         add_event_to_ctx(child_event, child_ctx);
 5320 
 5321         /*
 5322          * Get a reference to the parent filp - we will fput it
 5323          * when the child event exits. This is safe to do because
 5324          * we are in the parent and we know that the filp still
 5325          * exists and has a nonzero count:
 5326          */
 5327         atomic_long_inc(&parent_event->filp->f_count);
 5328 
 5329         /*
 5330          * Link this into the parent event's child list
 5331          */
 5332         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
 5333         mutex_lock(&parent_event->child_mutex);
 5334         list_add_tail(&child_event->child_list, &parent_event->child_list);
 5335         mutex_unlock(&parent_event->child_mutex);
 5336 
 5337         return child_event;
 5338 }
 5339 
 5340 static int inherit_group(struct perf_event *parent_event,
 5341               struct task_struct *parent,
 5342               struct perf_event_context *parent_ctx,
 5343               struct task_struct *child,
 5344               struct perf_event_context *child_ctx)
 5345 {
 5346         struct perf_event *leader;
 5347         struct perf_event *sub;
 5348         struct perf_event *child_ctr;
 5349 
 5350         leader = inherit_event(parent_event, parent, parent_ctx,
 5351                                  child, NULL, child_ctx);
 5352         if (IS_ERR(leader))
 5353                 return PTR_ERR(leader);
 5354         list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
 5355                 child_ctr = inherit_event(sub, parent, parent_ctx,
 5356                                             child, leader, child_ctx);
 5357                 if (IS_ERR(child_ctr))
 5358                         return PTR_ERR(child_ctr);
 5359         }
 5360         return 0;
 5361 }
 5362 
 5363 static void sync_child_event(struct perf_event *child_event,
 5364                                struct task_struct *child)
 5365 {
 5366         struct perf_event *parent_event = child_event->parent;
 5367         u64 child_val;
 5368 
 5369         if (child_event->attr.inherit_stat)
 5370                 perf_event_read_event(child_event, child);
 5371 
 5372         child_val = perf_event_count(child_event);
 5373 
 5374         /*
 5375          * Add back the child's count to the parent's count:
 5376          */
 5377         atomic64_add(child_val, &parent_event->child_count);
 5378         atomic64_add(child_event->total_time_enabled,
 5379                      &parent_event->child_total_time_enabled);
 5380         atomic64_add(child_event->total_time_running,
 5381                      &parent_event->child_total_time_running);
 5382 
 5383         /*
 5384          * Remove this event from the parent's list
 5385          */
 5386         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
 5387         mutex_lock(&parent_event->child_mutex);
 5388         list_del_init(&child_event->child_list);
 5389         mutex_unlock(&parent_event->child_mutex);
 5390 
 5391         /*
 5392          * Release the parent event, if this was the last
 5393          * reference to it.
 5394          */
 5395         fput(parent_event->filp);
 5396 }
 5397 
 5398 static void
 5399 __perf_event_exit_task(struct perf_event *child_event,
 5400                          struct perf_event_context *child_ctx,
 5401                          struct task_struct *child)
 5402 {
 5403         struct perf_event *parent_event;
 5404 
 5405         perf_event_remove_from_context(child_event);
 5406 
 5407         parent_event = child_event->parent;
 5408         /*
 5409          * It can happen that parent exits first, and has events
 5410          * that are still around due to the child reference. These
 5411          * events need to be zapped - but otherwise linger.
 5412          */
 5413         if (parent_event) {
 5414                 sync_child_event(child_event, child);
 5415                 free_event(child_event);
 5416         }
 5417 }
 5418 
 5419 /*
 5420  * When a child task exits, feed back event values to parent events.
 5421  */
 5422 void perf_event_exit_task(struct task_struct *child)
 5423 {
 5424         struct perf_event *child_event, *tmp;
 5425         struct perf_event_context *child_ctx;
 5426         unsigned long flags;
 5427 
 5428         if (likely(!child->perf_event_ctxp)) {
 5429                 perf_event_task(child, NULL, 0);
 5430                 return;
 5431         }
 5432 
 5433         local_irq_save(flags);
 5434         /*
 5435          * We can't reschedule here because interrupts are disabled,
 5436          * and either child is current or it is a task that can't be
 5437          * scheduled, so we are now safe from rescheduling changing
 5438          * our context.
 5439          */
 5440         child_ctx = child->perf_event_ctxp;
 5441         __perf_event_task_sched_out(child_ctx);
 5442 
 5443         /*
 5444          * Take the context lock here so that if find_get_context is
 5445          * reading child->perf_event_ctxp, we wait until it has
 5446          * incremented the context's refcount before we do put_ctx below.
 5447          */
 5448         raw_spin_lock(&child_ctx->lock);
 5449         child->perf_event_ctxp = NULL;
 5450         /*
 5451          * If this context is a clone, unclone it so it can't get
 5452          * swapped to another process while we're removing all
 5453          * the events from it.
 5454          */
 5455         unclone_ctx(child_ctx);
 5456         update_context_time(child_ctx);
 5457         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
 5458 
 5459         /*
 5460          * Report the task dead after unscheduling the events so that we
 5461          * won't get any samples after PERF_RECORD_EXIT. We can however still
 5462          * get a few PERF_RECORD_READ events.
 5463          */
 5464         perf_event_task(child, child_ctx, 0);
 5465 
 5466         /*
 5467          * We can recurse on the same lock type through:
 5468          *
 5469          *   __perf_event_exit_task()
 5470          *     sync_child_event()
 5471          *       fput(parent_event->filp)
 5472          *         perf_release()
 5473          *           mutex_lock(&ctx->mutex)
 5474          *
 5475          * But since it's the parent context, it won't be the same instance.
 5476          */
 5477         mutex_lock(&child_ctx->mutex);
 5478 
 5479 again:
 5480         list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
 5481                                  group_entry)
 5482                 __perf_event_exit_task(child_event, child_ctx, child);
 5483 
 5484         list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
 5485                                  group_entry)
 5486                 __perf_event_exit_task(child_event, child_ctx, child);
 5487 
 5488         /*
 5489          * If the last event was a group event, it will have appended all
 5490          * its siblings to the list, but we obtained 'tmp' before that which
 5491          * will still point to the list head terminating the iteration.
 5492          */
 5493         if (!list_empty(&child_ctx->pinned_groups) ||
 5494             !list_empty(&child_ctx->flexible_groups))
 5495                 goto again;
 5496 
 5497         mutex_unlock(&child_ctx->mutex);
 5498 
 5499         put_ctx(child_ctx);
 5500 }
 5501 
 5502 static void perf_free_event(struct perf_event *event,
 5503                             struct perf_event_context *ctx)
 5504 {
 5505         struct perf_event *parent = event->parent;
 5506 
 5507         if (WARN_ON_ONCE(!parent))
 5508                 return;
 5509 
 5510         mutex_lock(&parent->child_mutex);
 5511         list_del_init(&event->child_list);
 5512         mutex_unlock(&parent->child_mutex);
 5513 
 5514         fput(parent->filp);
 5515 
 5516         perf_group_detach(event);
 5517         list_del_event(event, ctx);
 5518         free_event(event);
 5519 }
 5520 
 5521 /*
 5522  * free an unexposed, unused context created by inheritance in
 5523  * perf_event_init_task() below; used by fork() if that fails.
 5524  */
 5525 void perf_event_free_task(struct task_struct *task)
 5526 {
 5527         struct perf_event_context *ctx = task->perf_event_ctxp;
 5528         struct perf_event *event, *tmp;
 5529 
 5530         if (!ctx)
 5531                 return;
 5532 
 5533         mutex_lock(&ctx->mutex);
 5534 again:
 5535         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
 5536                 perf_free_event(event, ctx);
 5537 
 5538         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
 5539                                  group_entry)
 5540                 perf_free_event(event, ctx);
 5541 
 5542         if (!list_empty(&ctx->pinned_groups) ||
 5543             !list_empty(&ctx->flexible_groups))
 5544                 goto again;
 5545 
 5546         mutex_unlock(&ctx->mutex);
 5547 
 5548         put_ctx(ctx);
 5549 }
 5550 
 5551 static int
 5552 inherit_task_group(struct perf_event *event, struct task_struct *parent,
 5553                    struct perf_event_context *parent_ctx,
 5554                    struct task_struct *child,
 5555                    int *inherited_all)
 5556 {
 5557         int ret;
 5558         struct perf_event_context *child_ctx = child->perf_event_ctxp;
 5559 
 5560         if (!event->attr.inherit) {
 5561                 *inherited_all = 0;
 5562                 return 0;
 5563         }
 5564 
 5565         if (!child_ctx) {
 5566                 /*
 5567                  * This is executed from the parent task context, so
 5568                  * inherit events that have been marked for cloning.
 5569                  * First allocate and initialize a context for the
 5570                  * child.
 5571                  */
 5572 
 5573                 child_ctx = kzalloc(sizeof(struct perf_event_context),
 5574                                     GFP_KERNEL);
 5575                 if (!child_ctx)
 5576                         return -ENOMEM;
 5577 
 5578                 __perf_event_init_context(child_ctx, child);
 5579                 child->perf_event_ctxp = child_ctx;
 5580                 get_task_struct(child);
 5581         }
 5582 
 5583         ret = inherit_group(event, parent, parent_ctx,
 5584                             child, child_ctx);
 5585 
 5586         if (ret)
 5587                 *inherited_all = 0;
 5588 
 5589         return ret;
 5590 }
 5591 
 5592 
 5593 /*
 5594  * Initialize the perf_event context in task_struct
 5595  */
 5596 int perf_event_init_task(struct task_struct *child)
 5597 {
 5598         struct perf_event_context *child_ctx, *parent_ctx;
 5599         struct perf_event_context *cloned_ctx;
 5600         struct perf_event *event;
 5601         struct task_struct *parent = current;
 5602         int inherited_all = 1;
 5603         int ret = 0;
 5604 
 5605         child->perf_event_ctxp = NULL;
 5606 
 5607         mutex_init(&child->perf_event_mutex);
 5608         INIT_LIST_HEAD(&child->perf_event_list);
 5609 
 5610         if (likely(!parent->perf_event_ctxp))
 5611                 return 0;
 5612 
 5613         /*
 5614          * If the parent's context is a clone, pin it so it won't get
 5615          * swapped under us.
 5616          */
 5617         parent_ctx = perf_pin_task_context(parent);
 5618 
 5619         /*
 5620          * No need to check if parent_ctx != NULL here; since we saw
 5621          * it non-NULL earlier, the only reason for it to become NULL
 5622          * is if we exit, and since we're currently in the middle of
 5623          * a fork we can't be exiting at the same time.
 5624          */
 5625 
 5626         /*
 5627          * Lock the parent list. No need to lock the child - not PID
 5628          * hashed yet and not running, so nobody can access it.
 5629          */
 5630         mutex_lock(&parent_ctx->mutex);
 5631 
 5632         /*
 5633          * We don't have to disable NMIs - we are only looking at
 5634          * the list, not manipulating it:
 5635          */
 5636         list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
 5637                 ret = inherit_task_group(event, parent, parent_ctx, child,
 5638                                          &inherited_all);
 5639                 if (ret)
 5640                         break;
 5641         }
 5642 
 5643         list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
 5644                 ret = inherit_task_group(event, parent, parent_ctx, child,
 5645                                          &inherited_all);
 5646                 if (ret)
 5647                         break;
 5648         }
 5649 
 5650         child_ctx = child->perf_event_ctxp;
 5651 
 5652         if (child_ctx && inherited_all) {
 5653                 /*
 5654                  * Mark the child context as a clone of the parent
 5655                  * context, or of whatever the parent is a clone of.
 5656                  * Note that if the parent is a clone, it could get
 5657                  * uncloned at any point, but that doesn't matter
 5658                  * because the list of events and the generation
 5659                  * count can't have changed since we took the mutex.
 5660                  */
 5661                 cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
 5662                 if (cloned_ctx) {
 5663                         child_ctx->parent_ctx = cloned_ctx;
 5664                         child_ctx->parent_gen = parent_ctx->parent_gen;
 5665                 } else {
 5666                         child_ctx->parent_ctx = parent_ctx;
 5667                         child_ctx->parent_gen = parent_ctx->generation;
 5668                 }
 5669                 get_ctx(child_ctx->parent_ctx);
 5670         }
 5671 
 5672         mutex_unlock(&parent_ctx->mutex);
 5673 
 5674         perf_unpin_context(parent_ctx);
 5675 
 5676         return ret;
 5677 }
 5678 
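/*
 * Illustrative userspace sketch (not part of this source file): the
 * inheritance machinery above is driven by attr.inherit.  A counter
 * opened this way on the current task is cloned into each child at fork
 * (perf_event_init_task()), and the children's counts are folded back
 * into the parent event when they exit (perf_event_exit_task() and
 * sync_child_event()).  The helper name is hypothetical.
 */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_inherited_counter(void)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES;
        attr.inherit = 1;           /* clone into children at fork */
        attr.inherit_stat = 1;      /* per-child values fed back at exit */

        /* pid == 0: current task, cpu == -1: any cpu. */
        return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}
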
 5679 static void __init perf_event_init_all_cpus(void)
 5680 {
 5681         int cpu;
 5682         struct perf_cpu_context *cpuctx;
 5683 
 5684         for_each_possible_cpu(cpu) {
 5685                 cpuctx = &per_cpu(perf_cpu_context, cpu);
 5686                 mutex_init(&cpuctx->hlist_mutex);
 5687                 __perf_event_init_context(&cpuctx->ctx, NULL);
 5688         }
 5689 }
 5690 
 5691 static void __cpuinit perf_event_init_cpu(int cpu)
 5692 {
 5693         struct perf_cpu_context *cpuctx;
 5694 
 5695         cpuctx = &per_cpu(perf_cpu_context, cpu);
 5696 
 5697         spin_lock(&perf_resource_lock);
 5698         cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
 5699         spin_unlock(&perf_resource_lock);
 5700 
 5701         mutex_lock(&cpuctx->hlist_mutex);
 5702         if (cpuctx->hlist_refcount > 0) {
 5703                 struct swevent_hlist *hlist;
 5704 
 5705                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
 5706                 WARN_ON_ONCE(!hlist);
 5707                 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
 5708         }
 5709         mutex_unlock(&cpuctx->hlist_mutex);
 5710 }
 5711 
 5712 #ifdef CONFIG_HOTPLUG_CPU
 5713 static void __perf_event_exit_cpu(void *info)
 5714 {
 5715         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 5716         struct perf_event_context *ctx = &cpuctx->ctx;
 5717         struct perf_event *event, *tmp;
 5718 
 5719         list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
 5720                 __perf_event_remove_from_context(event);
 5721         list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
 5722                 __perf_event_remove_from_context(event);
 5723 }
 5724 static void perf_event_exit_cpu(int cpu)
 5725 {
 5726         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 5727         struct perf_event_context *ctx = &cpuctx->ctx;
 5728 
 5729         mutex_lock(&cpuctx->hlist_mutex);
 5730         swevent_hlist_release(cpuctx);
 5731         mutex_unlock(&cpuctx->hlist_mutex);
 5732 
 5733         mutex_lock(&ctx->mutex);
 5734         smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
 5735         mutex_unlock(&ctx->mutex);
 5736 }
 5737 #else
 5738 static inline void perf_event_exit_cpu(int cpu) { }
 5739 #endif
 5740 
 5741 static int __cpuinit
 5742 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 5743 {
 5744         unsigned int cpu = (long)hcpu;
 5745 
 5746         switch (action) {
 5747 
 5748         case CPU_UP_PREPARE:
 5749         case CPU_UP_PREPARE_FROZEN:
 5750                 perf_event_init_cpu(cpu);
 5751                 break;
 5752 
 5753         case CPU_DOWN_PREPARE:
 5754         case CPU_DOWN_PREPARE_FROZEN:
 5755                 perf_event_exit_cpu(cpu);
 5756                 break;
 5757 
 5758         default:
 5759                 break;
 5760         }
 5761 
 5762         return NOTIFY_OK;
 5763 }
 5764 
 5765 /*
 5766  * This has to have a higher priority than migration_notifier in sched.c.
 5767  */
 5768 static struct notifier_block __cpuinitdata perf_cpu_nb = {
 5769         .notifier_call          = perf_cpu_notify,
 5770         .priority               = 20,
 5771 };
 5772 
 5773 void __init perf_event_init(void)
 5774 {
 5775         perf_event_init_all_cpus();
 5776         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
 5777                         (void *)(long)smp_processor_id());
 5778         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
 5779                         (void *)(long)smp_processor_id());
 5780         register_cpu_notifier(&perf_cpu_nb);
 5781 }
 5782 
 5783 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
 5784                                         struct sysdev_class_attribute *attr,
 5785                                         char *buf)
 5786 {
 5787         return sprintf(buf, "%d\n", perf_reserved_percpu);
 5788 }
 5789 
 5790 static ssize_t
 5791 perf_set_reserve_percpu(struct sysdev_class *class,
 5792                         struct sysdev_class_attribute *attr,
 5793                         const char *buf,
 5794                         size_t count)
 5795 {
 5796         struct perf_cpu_context *cpuctx;
 5797         unsigned long val;
 5798         int err, cpu, mpt;
 5799 
 5800         err = strict_strtoul(buf, 10, &val);
 5801         if (err)
 5802                 return err;
 5803         if (val > perf_max_events)
 5804                 return -EINVAL;
 5805 
 5806         spin_lock(&perf_resource_lock);
 5807         perf_reserved_percpu = val;
 5808         for_each_online_cpu(cpu) {
 5809                 cpuctx = &per_cpu(perf_cpu_context, cpu);
 5810                 raw_spin_lock_irq(&cpuctx->ctx.lock);
 5811                 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
 5812                           perf_max_events - perf_reserved_percpu);
 5813                 cpuctx->max_pertask = mpt;
 5814                 raw_spin_unlock_irq(&cpuctx->ctx.lock);
 5815         }
 5816         spin_unlock(&perf_resource_lock);
 5817 
 5818         return count;
 5819 }
 5820 
 5821 static ssize_t perf_show_overcommit(struct sysdev_class *class,
 5822                                     struct sysdev_class_attribute *attr,
 5823                                     char *buf)
 5824 {
 5825         return sprintf(buf, "%d\n", perf_overcommit);
 5826 }
 5827 
 5828 static ssize_t
 5829 perf_set_overcommit(struct sysdev_class *class,
 5830                     struct sysdev_class_attribute *attr,
 5831                     const char *buf, size_t count)
 5832 {
 5833         unsigned long val;
 5834         int err;
 5835 
 5836         err = strict_strtoul(buf, 10, &val);
 5837         if (err)
 5838                 return err;
 5839         if (val > 1)
 5840                 return -EINVAL;
 5841 
 5842         spin_lock(&perf_resource_lock);
 5843         perf_overcommit = val;
 5844         spin_unlock(&perf_resource_lock);
 5845 
 5846         return count;
 5847 }
 5848 
 5849 static SYSDEV_CLASS_ATTR(
 5850                                 reserve_percpu,
 5851                                 0644,
 5852                                 perf_show_reserve_percpu,
 5853                                 perf_set_reserve_percpu
 5854                         );
 5855 
 5856 static SYSDEV_CLASS_ATTR(
 5857                                 overcommit,
 5858                                 0644,
 5859                                 perf_show_overcommit,
 5860                                 perf_set_overcommit
 5861                         );
 5862 
 5863 static struct attribute *perfclass_attrs[] = {
 5864         &attr_reserve_percpu.attr,
 5865         &attr_overcommit.attr,
 5866         NULL
 5867 };
 5868 
 5869 static struct attribute_group perfclass_attr_group = {
 5870         .attrs                  = perfclass_attrs,
 5871         .name                   = "perf_events",
 5872 };
 5873 
 5874 static int __init perf_event_sysfs_init(void)
 5875 {
 5876         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
 5877                                   &perfclass_attr_group);
 5878 }
 5879 device_initcall(perf_event_sysfs_init);
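
/*
 * Illustrative userspace sketch (not part of this source file): with the
 * group registered above, the two knobs are expected to appear as
 * /sys/devices/system/cpu/perf_events/reserve_percpu and .../overcommit.
 * perf_set_reserve_percpu() parses a plain decimal string, so writing one
 * as below is sufficient; the helper name is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int set_reserve_percpu(unsigned long n)
{
        char buf[32];
        int fd, len, ret;

        fd = open("/sys/devices/system/cpu/perf_events/reserve_percpu", O_WRONLY);
        if (fd < 0)
                return -1;

        len = snprintf(buf, sizeof(buf), "%lu\n", n);
        ret = (write(fd, buf, len) == len) ? 0 : -1;
        close(fd);
        return ret;
}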
