kernel/perf_counter.c

   1 /*
   2  * Performance counter core code
   3  *
   4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
   6  *
   7  *
   8  *  For licensing details see kernel-base/COPYING
   9  */
  10
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/cpu.h>
  14 #include <linux/smp.h>
  15 #include <linux/file.h>
  16 #include <linux/poll.h>
  17 #include <linux/sysfs.h>
  18 #include <linux/ptrace.h>
  19 #include <linux/percpu.h>
  20 #include <linux/vmstat.h>
  21 #include <linux/hardirq.h>
  22 #include <linux/rculist.h>
  23 #include <linux/uaccess.h>
  24 #include <linux/syscalls.h>
  25 #include <linux/anon_inodes.h>
  26 #include <linux/kernel_stat.h>
  27 #include <linux/perf_counter.h>
  28 #include <linux/dcache.h>
  29
  30 #include <asm/irq_regs.h>
  31
  32 /*
  33  * Each CPU has a list of per CPU counters:
  34  */
  35 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  36
  37 int perf_max_counters __read_mostly = 1;
  38 static int perf_reserved_percpu __read_mostly;
  39 static int perf_overcommit __read_mostly = 1;
  40
  41 /*
  42  * Mutex for (sysadmin-configurable) counter reservations:
  43  */
  44 static DEFINE_MUTEX(perf_resource_mutex);
  45
  46 /*
  47  * Architecture provided APIs - weak aliases:
  48  */
  49 extern __weak const struct hw_perf_counter_ops *
  50 hw_perf_counter_init(struct perf_counter *counter)
  51 {
  52         return NULL;
  53 }
  54
  55 u64 __weak hw_perf_save_disable(void)           { return 0; }
  56 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
  57 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
  58 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
  59                struct perf_cpu_context *cpuctx,
  60                struct perf_counter_context *ctx, int cpu)
  61 {
  62         return 0;
  63 }
  64
  65 void __weak perf_counter_print_debug(void)      { }
  66
  67 static void
  68 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  69 {
  70         struct perf_counter *group_leader = counter->group_leader;
  71
  72         /*
  73          * Depending on whether it is a standalone or sibling counter,
  74          * add it straight to the context's counter list, or to the group
  75          * leader's sibling list:
  76          */
  77         if (counter->group_leader == counter)
  78                 list_add_tail(&counter->list_entry, &ctx->counter_list);
  79         else {
  80                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
  81                 group_leader->nr_siblings++;
  82         }
  83
  84         list_add_rcu(&counter->event_entry, &ctx->event_list);
  85 }
  86
  87 static void
  88 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  89 {
  90         struct perf_counter *sibling, *tmp;
  91
  92         list_del_init(&counter->list_entry);
  93         list_del_rcu(&counter->event_entry);
  94
  95         if (counter->group_leader != counter)
  96                 counter->group_leader->nr_siblings--;
  97
  98         /*
  99          * If this was a group counter with sibling counters then
 100          * upgrade the siblings to singleton counters by adding them
 101          * to the context list directly:
 102          */
 103         list_for_each_entry_safe(sibling, tmp,
 104                                  &counter->sibling_list, list_entry) {
 105
 106                 list_move_tail(&sibling->list_entry, &ctx->counter_list);
 107                 sibling->group_leader = sibling;
 108         }
 109 }
 110
 111 static void
 112 counter_sched_out(struct perf_counter *counter,
 113                   struct perf_cpu_context *cpuctx,
 114                   struct perf_counter_context *ctx)
 115 {
 116         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 117                 return;
 118
 119         counter->state = PERF_COUNTER_STATE_INACTIVE;
 120         counter->tstamp_stopped = ctx->time;
 121         counter->hw_ops->disable(counter);
 122         counter->oncpu = -1;
 123
 124         if (!is_software_counter(counter))
 125                 cpuctx->active_oncpu--;
 126         ctx->nr_active--;
 127         if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
 128                 cpuctx->exclusive = 0;
 129 }
 130
 131 static void
 132 group_sched_out(struct perf_counter *group_counter,
 133                 struct perf_cpu_context *cpuctx,
 134                 struct perf_counter_context *ctx)
 135 {
 136         struct perf_counter *counter;
 137
 138         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
 139                 return;
 140
 141         counter_sched_out(group_counter, cpuctx, ctx);
 142
 143         /*
 144          * Schedule out siblings (if any):
 145          */
 146         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 147                 counter_sched_out(counter, cpuctx, ctx);
 148
 149         if (group_counter->hw_event.exclusive)
 150                 cpuctx->exclusive = 0;
 151 }
 152
 153 /*
 154  * Cross CPU call to remove a performance counter
 155  *
 156  * We disable the counter on the hardware level first. After that we
 157  * remove it from the context list.
 158  */
 159 static void __perf_counter_remove_from_context(void *info)
 160 {
 161         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 162         struct perf_counter *counter = info;
 163         struct perf_counter_context *ctx = counter->ctx;
 164         unsigned long flags;
 165         u64 perf_flags;
 166
 167         /*
 168          * If this is a task context, we need to check whether it is
 169          * the current task context of this cpu. If not it has been
 170          * scheduled out before the smp call arrived.
 171          */
 172         if (ctx->task && cpuctx->task_ctx != ctx)
 173                 return;
 174
 175         curr_rq_lock_irq_save(&flags);
 176         spin_lock(&ctx->lock);
 177
 178         counter_sched_out(counter, cpuctx, ctx);
 179
 180         counter->task = NULL;
 181         ctx->nr_counters--;
 182
 183         /*
 184          * Protect the list operation against NMI by disabling the
 185          * counters on a global level. NOP for non NMI based counters.
 186          */
 187         perf_flags = hw_perf_save_disable();
 188         list_del_counter(counter, ctx);
 189         hw_perf_restore(perf_flags);
 190
 191         if (!ctx->task) {
 192                 /*
 193                  * Allow more per task counters with respect to the
 194                  * reservation:
 195                  */
 196                 cpuctx->max_pertask =
 197                         min(perf_max_counters - ctx->nr_counters,
 198                             perf_max_counters - perf_reserved_percpu);
 199         }
 200
 201         spin_unlock(&ctx->lock);
 202         curr_rq_unlock_irq_restore(&flags);
 203 }
 204
 205
 206 /*
 207  * Remove the counter from a task's (or a CPU's) list of counters.
 208  *
 209  * Must be called with counter->mutex and ctx->mutex held.
 210  *
 211  * CPU counters are removed with a smp call. For task counters we only
 212  * call when the task is on a CPU.
 213  */
 214 static void perf_counter_remove_from_context(struct perf_counter *counter)
 215 {
 216         struct perf_counter_context *ctx = counter->ctx;
 217         struct task_struct *task = ctx->task;
 218
 219         if (!task) {
 220                 /*
 221                  * Per cpu counters are removed via an smp call and
 222                  * the removal is always sucessful.
 223                  */
 224                 smp_call_function_single(counter->cpu,
 225                                          __perf_counter_remove_from_context,
 226                                          counter, 1);
 227                 return;
 228         }
 229
 230 retry:
 231         task_oncpu_function_call(task, __perf_counter_remove_from_context,
 232                                  counter);
 233
 234         spin_lock_irq(&ctx->lock);
 235         /*
 236          * If the context is active we need to retry the smp call.
 237          */
 238         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
 239                 spin_unlock_irq(&ctx->lock);
 240                 goto retry;
 241         }
 242
 243         /*
 244          * The lock prevents that this context is scheduled in so we
 245          * can remove the counter safely, if the call above did not
 246          * succeed.
 247          */
 248         if (!list_empty(&counter->list_entry)) {
 249                 ctx->nr_counters--;
 250                 list_del_counter(counter, ctx);
 251                 counter->task = NULL;
 252         }
 253         spin_unlock_irq(&ctx->lock);
 254 }
 255
 256 static inline u64 perf_clock(void)
 257 {
 258         return cpu_clock(smp_processor_id());
 259 }
 260
 261 /*
 262  * Update the record of the current time in a context.
 263  */
 264 static void update_context_time(struct perf_counter_context *ctx)
 265 {
 266         u64 now = perf_clock();
 267
 268         ctx->time += now - ctx->timestamp;
 269         ctx->timestamp = now;
 270 }
 271
 272 /*
 273  * Update the total_time_enabled and total_time_running fields for a counter.
 274  */
 275 static void update_counter_times(struct perf_counter *counter)
 276 {
 277         struct perf_counter_context *ctx = counter->ctx;
 278         u64 run_end;
 279
 280         if (counter->state < PERF_COUNTER_STATE_INACTIVE)
 281                 return;
 282
 283         counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
 284
 285         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
 286                 run_end = counter->tstamp_stopped;
 287         else
 288                 run_end = ctx->time;
 289
 290         counter->total_time_running = run_end - counter->tstamp_running;
 291 }
 292
 293 /*
 294  * Update total_time_enabled and total_time_running for all counters in a group.
 295  */
 296 static void update_group_times(struct perf_counter *leader)
 297 {
 298         struct perf_counter *counter;
 299
 300         update_counter_times(leader);
 301         list_for_each_entry(counter, &leader->sibling_list, list_entry)
 302                 update_counter_times(counter);
 303 }
 304
 305 /*
 306  * Cross CPU call to disable a performance counter
 307  */
 308 static void __perf_counter_disable(void *info)
 309 {
 310         struct perf_counter *counter = info;
 311         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 312         struct perf_counter_context *ctx = counter->ctx;
 313         unsigned long flags;
 314
 315         /*
 316          * If this is a per-task counter, need to check whether this
 317          * counter's task is the current task on this cpu.
 318          */
 319         if (ctx->task && cpuctx->task_ctx != ctx)
 320                 return;
 321
 322         curr_rq_lock_irq_save(&flags);
 323         spin_lock(&ctx->lock);
 324
 325         /*
 326          * If the counter is on, turn it off.
 327          * If it is in error state, leave it in error state.
 328          */
 329         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
 330                 update_context_time(ctx);
 331                 update_counter_times(counter);
 332                 if (counter == counter->group_leader)
 333                         group_sched_out(counter, cpuctx, ctx);
 334                 else
 335                         counter_sched_out(counter, cpuctx, ctx);
 336                 counter->state = PERF_COUNTER_STATE_OFF;
 337         }
 338
 339         spin_unlock(&ctx->lock);
 340         curr_rq_unlock_irq_restore(&flags);
 341 }
 342
 343 /*
 344  * Disable a counter.
 345  */
 346 static void perf_counter_disable(struct perf_counter *counter)
 347 {
 348         struct perf_counter_context *ctx = counter->ctx;
 349         struct task_struct *task = ctx->task;
 350
 351         if (!task) {
 352                 /*
 353                  * Disable the counter on the cpu that it's on
 354                  */
 355                 smp_call_function_single(counter->cpu, __perf_counter_disable,
 356                                          counter, 1);
 357                 return;
 358         }
 359
 360  retry:
 361         task_oncpu_function_call(task, __perf_counter_disable, counter);
 362
 363         spin_lock_irq(&ctx->lock);
 364         /*
 365          * If the counter is still active, we need to retry the cross-call.
 366          */
 367         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 368                 spin_unlock_irq(&ctx->lock);
 369                 goto retry;
 370         }
 371
 372         /*
 373          * Since we have the lock this context can't be scheduled
 374          * in, so we can change the state safely.
 375          */
 376         if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 377                 update_counter_times(counter);
 378                 counter->state = PERF_COUNTER_STATE_OFF;
 379         }
 380
 381         spin_unlock_irq(&ctx->lock);
 382 }
 383
 384 /*
 385  * Disable a counter and all its children.
 386  */
 387 static void perf_counter_disable_family(struct perf_counter *counter)
 388 {
 389         struct perf_counter *child;
 390
 391         perf_counter_disable(counter);
 392
 393         /*
 394          * Lock the mutex to protect the list of children
 395          */
 396         mutex_lock(&counter->mutex);
 397         list_for_each_entry(child, &counter->child_list, child_list)
 398                 perf_counter_disable(child);
 399         mutex_unlock(&counter->mutex);
 400 }
 401
 402 static int
 403 counter_sched_in(struct perf_counter *counter,
 404                  struct perf_cpu_context *cpuctx,
 405                  struct perf_counter_context *ctx,
 406                  int cpu)
 407 {
 408         if (counter->state <= PERF_COUNTER_STATE_OFF)
 409                 return 0;
 410
 411         counter->state = PERF_COUNTER_STATE_ACTIVE;
 412         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
 413         /*
 414          * The new state must be visible before we turn it on in the hardware:
 415          */
 416         smp_wmb();
 417
 418         if (counter->hw_ops->enable(counter)) {
 419                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 420                 counter->oncpu = -1;
 421                 return -EAGAIN;
 422         }
 423
 424         counter->tstamp_running += ctx->time - counter->tstamp_stopped;
 425
 426         if (!is_software_counter(counter))
 427                 cpuctx->active_oncpu++;
 428         ctx->nr_active++;
 429
 430         if (counter->hw_event.exclusive)
 431                 cpuctx->exclusive = 1;
 432
 433         return 0;
 434 }
 435
 436 /*
 437  * Return 1 for a group consisting entirely of software counters,
 438  * 0 if the group contains any hardware counters.
 439  */
 440 static int is_software_only_group(struct perf_counter *leader)
 441 {
 442         struct perf_counter *counter;
 443
 444         if (!is_software_counter(leader))
 445                 return 0;
 446
 447         list_for_each_entry(counter, &leader->sibling_list, list_entry)
 448                 if (!is_software_counter(counter))
 449                         return 0;
 450
 451         return 1;
 452 }
 453
 454 /*
 455  * Work out whether we can put this counter group on the CPU now.
 456  */
 457 static int group_can_go_on(struct perf_counter *counter,
 458                            struct perf_cpu_context *cpuctx,
 459                            int can_add_hw)
 460 {
 461         /*
 462          * Groups consisting entirely of software counters can always go on.
 463          */
 464         if (is_software_only_group(counter))
 465                 return 1;
 466         /*
 467          * If an exclusive group is already on, no other hardware
 468          * counters can go on.
 469          */
 470         if (cpuctx->exclusive)
 471                 return 0;
 472         /*
 473          * If this group is exclusive and there are already
 474          * counters on the CPU, it can't go on.
 475          */
 476         if (counter->hw_event.exclusive && cpuctx->active_oncpu)
 477                 return 0;
 478         /*
 479          * Otherwise, try to add it if all previous groups were able
 480          * to go on.
 481          */
 482         return can_add_hw;
 483 }
 484
 485 static void add_counter_to_ctx(struct perf_counter *counter,
 486                                struct perf_counter_context *ctx)
 487 {
 488         list_add_counter(counter, ctx);
 489         ctx->nr_counters++;
 490         counter->prev_state = PERF_COUNTER_STATE_OFF;
 491         counter->tstamp_enabled = ctx->time;
 492         counter->tstamp_running = ctx->time;
 493         counter->tstamp_stopped = ctx->time;
 494 }
 495
 496 /*
 497  * Cross CPU call to install and enable a performance counter
 498  */
 499 static void __perf_install_in_context(void *info)
 500 {
 501         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 502         struct perf_counter *counter = info;
 503         struct perf_counter_context *ctx = counter->ctx;
 504         struct perf_counter *leader = counter->group_leader;
 505         int cpu = smp_processor_id();
 506         unsigned long flags;
 507         u64 perf_flags;
 508         int err;
 509
 510         /*
 511          * If this is a task context, we need to check whether it is
 512          * the current task context of this cpu. If not it has been
 513          * scheduled out before the smp call arrived.
 514          */
 515         if (ctx->task && cpuctx->task_ctx != ctx)
 516                 return;
 517
 518         curr_rq_lock_irq_save(&flags);
 519         spin_lock(&ctx->lock);
 520         update_context_time(ctx);
 521
 522         /*
 523          * Protect the list operation against NMI by disabling the
 524          * counters on a global level. NOP for non NMI based counters.
 525          */
 526         perf_flags = hw_perf_save_disable();
 527
 528         add_counter_to_ctx(counter, ctx);
 529
 530         /*
 531          * Don't put the counter on if it is disabled or if
 532          * it is in a group and the group isn't on.
 533          */
 534         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
 535             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
 536                 goto unlock;
 537
 538         /*
 539          * An exclusive counter can't go on if there are already active
 540          * hardware counters, and no hardware counter can go on if there
 541          * is already an exclusive counter on.
 542          */
 543         if (!group_can_go_on(counter, cpuctx, 1))
 544                 err = -EEXIST;
 545         else
 546                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
 547
 548         if (err) {
 549                 /*
 550                  * This counter couldn't go on.  If it is in a group
 551                  * then we have to pull the whole group off.
 552                  * If the counter group is pinned then put it in error state.
 553                  */
 554                 if (leader != counter)
 555                         group_sched_out(leader, cpuctx, ctx);
 556                 if (leader->hw_event.pinned) {
 557                         update_group_times(leader);
 558                         leader->state = PERF_COUNTER_STATE_ERROR;
 559                 }
 560         }
 561
 562         if (!err && !ctx->task && cpuctx->max_pertask)
 563                 cpuctx->max_pertask--;
 564
 565  unlock:
 566         hw_perf_restore(perf_flags);
 567
 568         spin_unlock(&ctx->lock);
 569         curr_rq_unlock_irq_restore(&flags);
 570 }
 571
 572 /*
 573  * Attach a performance counter to a context
 574  *
 575  * First we add the counter to the list with the hardware enable bit
 576  * in counter->hw_config cleared.
 577  *
 578  * If the counter is attached to a task which is on a CPU we use a smp
 579  * call to enable it in the task context. The task might have been
 580  * scheduled away, but we check this in the smp call again.
 581  *
 582  * Must be called with ctx->mutex held.
 583  */
 584 static void
 585 perf_install_in_context(struct perf_counter_context *ctx,
 586                         struct perf_counter *counter,
 587                         int cpu)
 588 {
 589         struct task_struct *task = ctx->task;
 590
 591         if (!task) {
 592                 /*
 593                  * Per cpu counters are installed via an smp call and
 594                  * the install is always sucessful.
 595                  */
 596                 smp_call_function_single(cpu, __perf_install_in_context,
 597                                          counter, 1);
 598                 return;
 599         }
 600
 601         counter->task = task;
 602 retry:
 603         task_oncpu_function_call(task, __perf_install_in_context,
 604                                  counter);
 605
 606         spin_lock_irq(&ctx->lock);
 607         /*
 608          * we need to retry the smp call.
 609          */
 610         if (ctx->is_active && list_empty(&counter->list_entry)) {
 611                 spin_unlock_irq(&ctx->lock);
 612                 goto retry;
 613         }
 614
 615         /*
 616          * The lock prevents that this context is scheduled in so we
 617          * can add the counter safely, if it the call above did not
 618          * succeed.
 619          */
 620         if (list_empty(&counter->list_entry))
 621                 add_counter_to_ctx(counter, ctx);
 622         spin_unlock_irq(&ctx->lock);
 623 }
 624
 625 /*
 626  * Cross CPU call to enable a performance counter
 627  */
 628 static void __perf_counter_enable(void *info)
 629 {
 630         struct perf_counter *counter = info;
 631         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 632         struct perf_counter_context *ctx = counter->ctx;
 633         struct perf_counter *leader = counter->group_leader;
 634         unsigned long flags;
 635         int err;
 636
 637         /*
 638          * If this is a per-task counter, need to check whether this
 639          * counter's task is the current task on this cpu.
 640          */
 641         if (ctx->task && cpuctx->task_ctx != ctx)
 642                 return;
 643
 644         curr_rq_lock_irq_save(&flags);
 645         spin_lock(&ctx->lock);
 646         update_context_time(ctx);
 647
 648         counter->prev_state = counter->state;
 649         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 650                 goto unlock;
 651         counter->state = PERF_COUNTER_STATE_INACTIVE;
 652         counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
 653
 654         /*
 655          * If the counter is in a group and isn't the group leader,
 656          * then don't put it on unless the group is on.
 657          */
 658         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
 659                 goto unlock;
 660
 661         if (!group_can_go_on(counter, cpuctx, 1))
 662                 err = -EEXIST;
 663         else
 664                 err = counter_sched_in(counter, cpuctx, ctx,
 665                                        smp_processor_id());
 666
 667         if (err) {
 668                 /*
 669                  * If this counter can't go on and it's part of a
 670                  * group, then the whole group has to come off.
 671                  */
 672                 if (leader != counter)
 673                         group_sched_out(leader, cpuctx, ctx);
 674                 if (leader->hw_event.pinned) {
 675                         update_group_times(leader);
 676                         leader->state = PERF_COUNTER_STATE_ERROR;
 677                 }
 678         }
 679
 680  unlock:
 681         spin_unlock(&ctx->lock);
 682         curr_rq_unlock_irq_restore(&flags);
 683 }
 684
 685 /*
 686  * Enable a counter.
 687  */
 688 static void perf_counter_enable(struct perf_counter *counter)
 689 {
 690         struct perf_counter_context *ctx = counter->ctx;
 691         struct task_struct *task = ctx->task;
 692
 693         if (!task) {
 694                 /*
 695                  * Enable the counter on the cpu that it's on
 696                  */
 697                 smp_call_function_single(counter->cpu, __perf_counter_enable,
 698                                          counter, 1);
 699                 return;
 700         }
 701
 702         spin_lock_irq(&ctx->lock);
 703         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 704                 goto out;
 705
 706         /*
 707          * If the counter is in error state, clear that first.
 708          * That way, if we see the counter in error state below, we
 709          * know that it has gone back into error state, as distinct
 710          * from the task having been scheduled away before the
 711          * cross-call arrived.
 712          */
 713         if (counter->state == PERF_COUNTER_STATE_ERROR)
 714                 counter->state = PERF_COUNTER_STATE_OFF;
 715
 716  retry:
 717         spin_unlock_irq(&ctx->lock);
 718         task_oncpu_function_call(task, __perf_counter_enable, counter);
 719
 720         spin_lock_irq(&ctx->lock);
 721
 722         /*
 723          * If the context is active and the counter is still off,
 724          * we need to retry the cross-call.
 725          */
 726         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
 727                 goto retry;
 728
 729         /*
 730          * Since we have the lock this context can't be scheduled
 731          * in, so we can change the state safely.
 732          */
 733         if (counter->state == PERF_COUNTER_STATE_OFF) {
 734                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 735                 counter->tstamp_enabled =
 736                         ctx->time - counter->total_time_enabled;
 737         }
 738  out:
 739         spin_unlock_irq(&ctx->lock);
 740 }
 741
 742 static void perf_counter_refresh(struct perf_counter *counter, int refresh)
 743 {
 744         atomic_add(refresh, &counter->event_limit);
 745         perf_counter_enable(counter);
 746 }
 747
 748 /*
 749  * Enable a counter and all its children.
 750  */
 751 static void perf_counter_enable_family(struct perf_counter *counter)
 752 {
 753         struct perf_counter *child;
 754
 755         perf_counter_enable(counter);
 756
 757         /*
 758          * Lock the mutex to protect the list of children
 759          */
 760         mutex_lock(&counter->mutex);
 761         list_for_each_entry(child, &counter->child_list, child_list)
 762                 perf_counter_enable(child);
 763         mutex_unlock(&counter->mutex);
 764 }
 765
 766 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 767                               struct perf_cpu_context *cpuctx)
 768 {
 769         struct perf_counter *counter;
 770         u64 flags;
 771
 772         spin_lock(&ctx->lock);
 773         ctx->is_active = 0;
 774         if (likely(!ctx->nr_counters))
 775                 goto out;
 776         update_context_time(ctx);
 777
 778         flags = hw_perf_save_disable();
 779         if (ctx->nr_active) {
 780                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
 781                         group_sched_out(counter, cpuctx, ctx);
 782         }
 783         hw_perf_restore(flags);
 784  out:
 785         spin_unlock(&ctx->lock);
 786 }
 787
 788 /*
 789  * Called from scheduler to remove the counters of the current task,
 790  * with interrupts disabled.
 791  *
 792  * We stop each counter and update the counter value in counter->count.
 793  *
 794  * This does not protect us against NMI, but disable()
 795  * sets the disabled bit in the control field of counter _before_
 796  * accessing the counter control register. If a NMI hits, then it will
 797  * not restart the counter.
 798  */
 799 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 800 {
 801         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 802         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 803         struct pt_regs *regs;
 804
 805         if (likely(!cpuctx->task_ctx))
 806                 return;
 807
 808         regs = task_pt_regs(task);
 809         perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
 810         __perf_counter_sched_out(ctx, cpuctx);
 811
 812         cpuctx->task_ctx = NULL;
 813 }
 814
 815 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 816 {
 817         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 818 }
 819
 820 static int
 821 group_sched_in(struct perf_counter *group_counter,
 822                struct perf_cpu_context *cpuctx,
 823                struct perf_counter_context *ctx,
 824                int cpu)
 825 {
 826         struct perf_counter *counter, *partial_group;
 827         int ret;
 828
 829         if (group_counter->state == PERF_COUNTER_STATE_OFF)
 830                 return 0;
 831
 832         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
 833         if (ret)
 834                 return ret < 0 ? ret : 0;
 835
 836         group_counter->prev_state = group_counter->state;
 837         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 838                 return -EAGAIN;
 839
 840         /*
 841          * Schedule in siblings as one group (if any):
 842          */
 843         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 844                 counter->prev_state = counter->state;
 845                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 846                         partial_group = counter;
 847                         goto group_error;
 848                 }
 849         }
 850
 851         return 0;
 852
 853 group_error:
 854         /*
 855          * Groups can be scheduled in as one unit only, so undo any
 856          * partial group before returning:
 857          */
 858         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 859                 if (counter == partial_group)
 860                         break;
 861                 counter_sched_out(counter, cpuctx, ctx);
 862         }
 863         counter_sched_out(group_counter, cpuctx, ctx);
 864
 865         return -EAGAIN;
 866 }
 867
 868 static void
 869 __perf_counter_sched_in(struct perf_counter_context *ctx,
 870                         struct perf_cpu_context *cpuctx, int cpu)
 871 {
 872         struct perf_counter *counter;
 873         u64 flags;
 874         int can_add_hw = 1;
 875
 876         spin_lock(&ctx->lock);
 877         ctx->is_active = 1;
 878         if (likely(!ctx->nr_counters))
 879                 goto out;
 880
 881         ctx->timestamp = perf_clock();
 882
 883         flags = hw_perf_save_disable();
 884
 885         /*
 886          * First go through the list and put on any pinned groups
 887          * in order to give them the best chance of going on.
 888          */
 889         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 890                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 891                     !counter->hw_event.pinned)
 892                         continue;
 893                 if (counter->cpu != -1 && counter->cpu != cpu)
 894                         continue;
 895
 896                 if (group_can_go_on(counter, cpuctx, 1))
 897                         group_sched_in(counter, cpuctx, ctx, cpu);
 898
 899                 /*
 900                  * If this pinned group hasn't been scheduled,
 901                  * put it in error state.
 902                  */
 903                 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 904                         update_group_times(counter);
 905                         counter->state = PERF_COUNTER_STATE_ERROR;
 906                 }
 907         }
 908
 909         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 910                 /*
 911                  * Ignore counters in OFF or ERROR state, and
 912                  * ignore pinned counters since we did them already.
 913                  */
 914                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 915                     counter->hw_event.pinned)
 916                         continue;
 917
 918                 /*
 919                  * Listen to the 'cpu' scheduling filter constraint
 920                  * of counters:
 921                  */
 922                 if (counter->cpu != -1 && counter->cpu != cpu)
 923                         continue;
 924
 925                 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
 926                         if (group_sched_in(counter, cpuctx, ctx, cpu))
 927                                 can_add_hw = 0;
 928                 }
 929         }
 930         hw_perf_restore(flags);
 931  out:
 932         spin_unlock(&ctx->lock);
 933 }
 934
 935 /*
 936  * Called from scheduler to add the counters of the current task
 937  * with interrupts disabled.
 938  *
 939  * We restore the counter value and then enable it.
 940  *
 941  * This does not protect us against NMI, but enable()
 942  * sets the enabled bit in the control field of counter _before_
 943  * accessing the counter control register. If a NMI hits, then it will
 944  * keep the counter running.
 945  */
 946 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 947 {
 948         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 949         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 950
 951         __perf_counter_sched_in(ctx, cpuctx, cpu);
 952         cpuctx->task_ctx = ctx;
 953 }
 954
 955 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 956 {
 957         struct perf_counter_context *ctx = &cpuctx->ctx;
 958
 959         __perf_counter_sched_in(ctx, cpuctx, cpu);
 960 }
 961
 962 int perf_counter_task_disable(void)
 963 {
 964         struct task_struct *curr = current;
 965         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 966         struct perf_counter *counter;
 967         unsigned long flags;
 968         u64 perf_flags;
 969         int cpu;
 970
 971         if (likely(!ctx->nr_counters))
 972                 return 0;
 973
 974         curr_rq_lock_irq_save(&flags);
 975         cpu = smp_processor_id();
 976
 977         /* force the update of the task clock: */
 978         __task_delta_exec(curr, 1);
 979
 980         perf_counter_task_sched_out(curr, cpu);
 981
 982         spin_lock(&ctx->lock);
 983
 984         /*
 985          * Disable all the counters:
 986          */
 987         perf_flags = hw_perf_save_disable();
 988
 989         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 990                 if (counter->state != PERF_COUNTER_STATE_ERROR) {
 991                         update_group_times(counter);
 992                         counter->state = PERF_COUNTER_STATE_OFF;
 993                 }
 994         }
 995
 996         hw_perf_restore(perf_flags);
 997
 998         spin_unlock(&ctx->lock);
 999
1000         curr_rq_unlock_irq_restore(&flags);
1001
1002         return 0;
1003 }
1004
1005 int perf_counter_task_enable(void)
1006 {
1007         struct task_struct *curr = current;
1008         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1009         struct perf_counter *counter;
1010         unsigned long flags;
1011         u64 perf_flags;
1012         int cpu;
1013
1014         if (likely(!ctx->nr_counters))
1015                 return 0;
1016
1017         curr_rq_lock_irq_save(&flags);
1018         cpu = smp_processor_id();
1019
1020         /* force the update of the task clock: */
1021         __task_delta_exec(curr, 1);
1022
1023         perf_counter_task_sched_out(curr, cpu);
1024
1025         spin_lock(&ctx->lock);
1026
1027         /*
1028          * Disable all the counters:
1029          */
1030         perf_flags = hw_perf_save_disable();
1031
1032         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1033                 if (counter->state > PERF_COUNTER_STATE_OFF)
1034                         continue;
1035                 counter->state = PERF_COUNTER_STATE_INACTIVE;
1036                 counter->tstamp_enabled =
1037                         ctx->time - counter->total_time_enabled;
1038                 counter->hw_event.disabled = 0;
1039         }
1040         hw_perf_restore(perf_flags);
1041
1042         spin_unlock(&ctx->lock);
1043
1044         perf_counter_task_sched_in(curr, cpu);
1045
1046         curr_rq_unlock_irq_restore(&flags);
1047
1048         return 0;
1049 }
1050
1051 /*
1052  * Round-robin a context's counters:
1053  */
1054 static void rotate_ctx(struct perf_counter_context *ctx)
1055 {
1056         struct perf_counter *counter;
1057         u64 perf_flags;
1058
1059         if (!ctx->nr_counters)
1060                 return;
1061
1062         spin_lock(&ctx->lock);
1063         /*
1064          * Rotate the first entry last (works just fine for group counters too):
1065          */
1066         perf_flags = hw_perf_save_disable();
1067         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1068                 list_move_tail(&counter->list_entry, &ctx->counter_list);
1069                 break;
1070         }
1071         hw_perf_restore(perf_flags);
1072
1073         spin_unlock(&ctx->lock);
1074 }
1075
1076 void perf_counter_task_tick(struct task_struct *curr, int cpu)
1077 {
1078         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1079         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1080         const int rotate_percpu = 0;
1081
1082         if (rotate_percpu)
1083                 perf_counter_cpu_sched_out(cpuctx);
1084         perf_counter_task_sched_out(curr, cpu);
1085
1086         if (rotate_percpu)
1087                 rotate_ctx(&cpuctx->ctx);
1088         rotate_ctx(ctx);
1089
1090         if (rotate_percpu)
1091                 perf_counter_cpu_sched_in(cpuctx, cpu);
1092         perf_counter_task_sched_in(curr, cpu);
1093 }
1094
1095 /*
1096  * Cross CPU call to read the hardware counter
1097  */
1098 static void __read(void *info)
1099 {
1100         struct perf_counter *counter = info;
1101         struct perf_counter_context *ctx = counter->ctx;
1102         unsigned long flags;
1103
1104         curr_rq_lock_irq_save(&flags);
1105         if (ctx->is_active)
1106                 update_context_time(ctx);
1107         counter->hw_ops->read(counter);
1108         update_counter_times(counter);
1109         curr_rq_unlock_irq_restore(&flags);
1110 }
1111
1112 static u64 perf_counter_read(struct perf_counter *counter)
1113 {
1114         /*
1115          * If counter is enabled and currently active on a CPU, update the
1116          * value in the counter structure:
1117          */
1118         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1119                 smp_call_function_single(counter->oncpu,
1120                                          __read, counter, 1);
1121         } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1122                 update_counter_times(counter);
1123         }
1124
1125         return atomic64_read(&counter->count);
1126 }
1127
1128 static void put_context(struct perf_counter_context *ctx)
1129 {
1130         if (ctx->task)
1131                 put_task_struct(ctx->task);
1132 }
1133
1134 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1135 {
1136         struct perf_cpu_context *cpuctx;
1137         struct perf_counter_context *ctx;
1138         struct task_struct *task;
1139
1140         /*
1141          * If cpu is not a wildcard then this is a percpu counter:
1142          */
1143         if (cpu != -1) {
1144                 /* Must be root to operate on a CPU counter: */
1145                 if (!capable(CAP_SYS_ADMIN))
1146                         return ERR_PTR(-EACCES);
1147
1148                 if (cpu < 0 || cpu > num_possible_cpus())
1149                         return ERR_PTR(-EINVAL);
1150
1151                 /*
1152                  * We could be clever and allow to attach a counter to an
1153                  * offline CPU and activate it when the CPU comes up, but
1154                  * that's for later.
1155                  */
1156                 if (!cpu_isset(cpu, cpu_online_map))
1157                         return ERR_PTR(-ENODEV);
1158
1159                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1160                 ctx = &cpuctx->ctx;
1161
1162                 return ctx;
1163         }
1164
1165         rcu_read_lock();
1166         if (!pid)
1167                 task = current;
1168         else
1169                 task = find_task_by_vpid(pid);
1170         if (task)
1171                 get_task_struct(task);
1172         rcu_read_unlock();
1173
1174         if (!task)
1175                 return ERR_PTR(-ESRCH);
1176
1177         ctx = &task->perf_counter_ctx;
1178         ctx->task = task;
1179
1180         /* Reuse ptrace permission checks for now. */
1181         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1182                 put_context(ctx);
1183                 return ERR_PTR(-EACCES);
1184         }
1185
1186         return ctx;
1187 }
1188
1189 static void free_counter_rcu(struct rcu_head *head)
1190 {
1191         struct perf_counter *counter;
1192
1193         counter = container_of(head, struct perf_counter, rcu_head);
1194         kfree(counter);
1195 }
1196
1197 static void perf_pending_sync(struct perf_counter *counter);
1198
1199 static void free_counter(struct perf_counter *counter)
1200 {
1201         perf_pending_sync(counter);
1202
1203         if (counter->destroy)
1204                 counter->destroy(counter);
1205
1206         call_rcu(&counter->rcu_head, free_counter_rcu);
1207 }
1208
1209 /*
1210  * Called when the last reference to the file is gone.
1211  */
1212 static int perf_release(struct inode *inode, struct file *file)
1213 {
1214         struct perf_counter *counter = file->private_data;
1215         struct perf_counter_context *ctx = counter->ctx;
1216
1217         file->private_data = NULL;
1218
1219         mutex_lock(&ctx->mutex);
1220         mutex_lock(&counter->mutex);
1221
1222         perf_counter_remove_from_context(counter);
1223
1224         mutex_unlock(&counter->mutex);
1225         mutex_unlock(&ctx->mutex);
1226
1227         free_counter(counter);
1228         put_context(ctx);
1229
1230         return 0;
1231 }
1232
1233 /*
1234  * Read the performance counter - simple non blocking version for now
1235  */
1236 static ssize_t
1237 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1238 {
1239         u64 values[3];
1240         int n;
1241
1242         /*
1243          * Return end-of-file for a read on a counter that is in
1244          * error state (i.e. because it was pinned but it couldn't be
1245          * scheduled on to the CPU at some point).
1246          */
1247         if (counter->state == PERF_COUNTER_STATE_ERROR)
1248                 return 0;
1249
1250         mutex_lock(&counter->mutex);
1251         values[0] = perf_counter_read(counter);
1252         n = 1;
1253         if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1254                 values[n++] = counter->total_time_enabled +
1255                         atomic64_read(&counter->child_total_time_enabled);
1256         if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1257                 values[n++] = counter->total_time_running +
1258                         atomic64_read(&counter->child_total_time_running);
1259         mutex_unlock(&counter->mutex);
1260
1261         if (count < n * sizeof(u64))
1262                 return -EINVAL;
1263         count = n * sizeof(u64);
1264
1265         if (copy_to_user(buf, values, count))
1266                 return -EFAULT;
1267
1268         return count;
1269 }
1270
1271 static ssize_t
1272 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1273 {
1274         struct perf_counter *counter = file->private_data;
1275
1276         return perf_read_hw(counter, buf, count);
1277 }
1278
1279 static unsigned int perf_poll(struct file *file, poll_table *wait)
1280 {
1281         struct perf_counter *counter = file->private_data;
1282         struct perf_mmap_data *data;
1283         unsigned int events;
1284
1285         rcu_read_lock();
1286         data = rcu_dereference(counter->data);
1287         if (data)
1288                 events = atomic_xchg(&data->wakeup, 0);
1289         else
1290                 events = POLL_HUP;
1291         rcu_read_unlock();
1292
1293         poll_wait(file, &counter->waitq, wait);
1294
1295         return events;
1296 }
1297
1298 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1299 {
1300         struct perf_counter *counter = file->private_data;
1301         int err = 0;
1302
1303         switch (cmd) {
1304         case PERF_COUNTER_IOC_ENABLE:
1305                 perf_counter_enable_family(counter);
1306                 break;
1307         case PERF_COUNTER_IOC_DISABLE:
1308                 perf_counter_disable_family(counter);
1309                 break;
1310         case PERF_COUNTER_IOC_REFRESH:
1311                 perf_counter_refresh(counter, arg);
1312                 break;
1313         default:
1314                 err = -ENOTTY;
1315         }
1316         return err;
1317 }
1318
1319 /*
1320  * Callers need to ensure there can be no nesting of this function, otherwise
1321  * the seqlock logic goes bad. We can not serialize this because the arch
1322  * code calls this from NMI context.
1323  */
1324 void perf_counter_update_userpage(struct perf_counter *counter)
1325 {
1326         struct perf_mmap_data *data;
1327         struct perf_counter_mmap_page *userpg;
1328
1329         rcu_read_lock();
1330         data = rcu_dereference(counter->data);
1331         if (!data)
1332                 goto unlock;
1333
1334         userpg = data->user_page;
1335
1336         /*
1337          * Disable preemption so as to not let the corresponding user-space
1338          * spin too long if we get preempted.
1339          */
1340         preempt_disable();
1341         ++userpg->lock;
1342         barrier();
1343         userpg->index = counter->hw.idx;
1344         userpg->offset = atomic64_read(&counter->count);
1345         if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1346                 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1347
1348         barrier();
1349         ++userpg->lock;
1350         preempt_enable();
1351 unlock:
1352         rcu_read_unlock();
1353 }
1354
1355 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1356 {
1357         struct perf_counter *counter = vma->vm_file->private_data;
1358         struct perf_mmap_data *data;
1359         int ret = VM_FAULT_SIGBUS;
1360
1361         rcu_read_lock();
1362         data = rcu_dereference(counter->data);
1363         if (!data)
1364                 goto unlock;
1365
1366         if (vmf->pgoff == 0) {
1367                 vmf->page = virt_to_page(data->user_page);
1368         } else {
1369                 int nr = vmf->pgoff - 1;
1370
1371                 if ((unsigned)nr > data->nr_pages)
1372                         goto unlock;
1373
1374                 vmf->page = virt_to_page(data->data_pages[nr]);
1375         }
1376         get_page(vmf->page);
1377         ret = 0;
1378 unlock:
1379         rcu_read_unlock();
1380
1381         return ret;
1382 }
1383
1384 static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1385 {
1386         struct perf_mmap_data *data;
1387         unsigned long size;
1388         int i;
1389
1390         WARN_ON(atomic_read(&counter->mmap_count));
1391
1392         size = sizeof(struct perf_mmap_data);
1393         size += nr_pages * sizeof(void *);
1394
1395         data = kzalloc(size, GFP_KERNEL);
1396         if (!data)
1397                 goto fail;
1398
1399         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1400         if (!data->user_page)
1401                 goto fail_user_page;
1402
1403         for (i = 0; i < nr_pages; i++) {
1404                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1405                 if (!data->data_pages[i])
1406                         goto fail_data_pages;
1407         }
1408
1409         data->nr_pages = nr_pages;
1410
1411         rcu_assign_pointer(counter->data, data);
1412
1413         return 0;
1414
1415 fail_data_pages:
1416         for (i--; i >= 0; i--)
1417                 free_page((unsigned long)data->data_pages[i]);
1418
1419         free_page((unsigned long)data->user_page);
1420
1421 fail_user_page:
1422         kfree(data);
1423
1424 fail:
1425         return -ENOMEM;
1426 }
1427
1428 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1429 {
1430         struct perf_mmap_data *data = container_of(rcu_head,
1431                         struct perf_mmap_data, rcu_head);
1432         int i;
1433
1434         free_page((unsigned long)data->user_page);
1435         for (i = 0; i < data->nr_pages; i++)
1436                 free_page((unsigned long)data->data_pages[i]);
1437         kfree(data);
1438 }
1439
1440 static void perf_mmap_data_free(struct perf_counter *counter)
1441 {
1442         struct perf_mmap_data *data = counter->data;
1443
1444         WARN_ON(atomic_read(&counter->mmap_count));
1445
1446         rcu_assign_pointer(counter->data, NULL);
1447         call_rcu(&data->rcu_head, __perf_mmap_data_free);
1448 }
1449
1450 static void perf_mmap_open(struct vm_area_struct *vma)
1451 {
1452         struct perf_counter *counter = vma->vm_file->private_data;
1453
1454         atomic_inc(&counter->mmap_count);
1455 }
1456
1457 static void perf_mmap_close(struct vm_area_struct *vma)
1458 {
1459         struct perf_counter *counter = vma->vm_file->private_data;
1460
1461         if (atomic_dec_and_mutex_lock(&counter->mmap_count,
1462                                       &counter->mmap_mutex)) {
1463                 vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
1464                 perf_mmap_data_free(counter);
1465                 mutex_unlock(&counter->mmap_mutex);
1466         }
1467 }
1468
1469 static struct vm_operations_struct perf_mmap_vmops = {
1470         .open  = perf_mmap_open,
1471         .close = perf_mmap_close,
1472         .fault = perf_mmap_fault,
1473 };
1474
1475 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1476 {
1477         struct perf_counter *counter = file->private_data;
1478         unsigned long vma_size;
1479         unsigned long nr_pages;
1480         unsigned long locked, lock_limit;
1481         int ret = 0;
1482
1483         if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1484                 return -EINVAL;
1485
1486         vma_size = vma->vm_end - vma->vm_start;
1487         nr_pages = (vma_size / PAGE_SIZE) - 1;
1488
1489         /*
1490          * If we have data pages ensure they're a power-of-two number, so we
1491          * can do bitmasks instead of modulo.
1492          */
1493         if (nr_pages != 0 && !is_power_of_2(nr_pages))
1494                 return -EINVAL;
1495
1496         if (vma_size != PAGE_SIZE * (1 + nr_pages))
1497                 return -EINVAL;
1498
1499         if (vma->vm_pgoff != 0)
1500                 return -EINVAL;
1501
1502         mutex_lock(&counter->mmap_mutex);
1503         if (atomic_inc_not_zero(&counter->mmap_count)) {
1504                 if (nr_pages != counter->data->nr_pages)
1505                         ret = -EINVAL;
1506                 goto unlock;
1507         }
1508
1509         locked = vma->vm_mm->locked_vm;
1510         locked += nr_pages + 1;
1511
1512         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1513         lock_limit >>= PAGE_SHIFT;
1514
1515         if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1516                 ret = -EPERM;
1517                 goto unlock;
1518         }
1519
1520         WARN_ON(counter->data);
1521         ret = perf_mmap_data_alloc(counter, nr_pages);
1522         if (ret)
1523                 goto unlock;
1524
1525         atomic_set(&counter->mmap_count, 1);
1526         vma->vm_mm->locked_vm += nr_pages + 1;
1527 unlock:
1528         mutex_unlock(&counter->mmap_mutex);
1529
1530         vma->vm_flags &= ~VM_MAYWRITE;
1531         vma->vm_flags |= VM_RESERVED;
1532         vma->vm_ops = &perf_mmap_vmops;
1533
1534         return ret;
1535 }
1536
1537 static int perf_fasync(int fd, struct file *filp, int on)
1538 {
1539         struct perf_counter *counter = filp->private_data;
1540         struct inode *inode = filp->f_path.dentry->d_inode;
1541         int retval;
1542
1543         mutex_lock(&inode->i_mutex);
1544         retval = fasync_helper(fd, filp, on, &counter->fasync);
1545         mutex_unlock(&inode->i_mutex);
1546
1547         if (retval < 0)
1548                 return retval;
1549
1550         return 0;
1551 }
1552
1553 static const struct file_operations perf_fops = {
1554         .release                = perf_release,
1555         .read                   = perf_read,
1556         .poll                   = perf_poll,
1557         .unlocked_ioctl         = perf_ioctl,
1558         .compat_ioctl           = perf_ioctl,
1559         .mmap                   = perf_mmap,
1560         .fasync                 = perf_fasync,
1561 };
1562
1563 /*
1564  * Perf counter wakeup
1565  *
1566  * If there's data, ensure we set the poll() state and publish everything
1567  * to user-space before waking everybody up.
1568  */
1569
1570 void perf_counter_wakeup(struct perf_counter *counter)
1571 {
1572         struct perf_mmap_data *data;
1573
1574         rcu_read_lock();
1575         data = rcu_dereference(counter->data);
1576         if (data) {
1577                 atomic_set(&data->wakeup, POLL_IN);
1578                 /*
1579                  * Ensure all data writes are issued before updating the
1580                  * user-space data head information. The matching rmb()
1581                  * will be in userspace after reading this value.
1582                  */
1583                 smp_wmb();
1584                 data->user_page->data_head = atomic_read(&data->head);
1585         }
1586         rcu_read_unlock();
1587
1588         wake_up_all(&counter->waitq);
1589
1590         if (counter->pending_kill) {
1591                 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
1592                 counter->pending_kill = 0;
1593         }
1594 }
1595
1596 /*
1597  * Pending wakeups
1598  *
1599  * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
1600  *
1601  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
1602  * single linked list and use cmpxchg() to add entries lockless.
1603  */
1604
1605 static void perf_pending_counter(struct perf_pending_entry *entry)
1606 {
1607         struct perf_counter *counter = container_of(entry,
1608                         struct perf_counter, pending);
1609
1610         if (counter->pending_disable) {
1611                 counter->pending_disable = 0;
1612                 perf_counter_disable(counter);
1613         }
1614
1615         if (counter->pending_wakeup) {
1616                 counter->pending_wakeup = 0;
1617                 perf_counter_wakeup(counter);
1618         }
1619 }
1620
1621 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
1622
1623 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
1624         PENDING_TAIL,
1625 };
1626
1627 static void perf_pending_queue(struct perf_pending_entry *entry,
1628                                void (*func)(struct perf_pending_entry *))
1629 {
1630         struct perf_pending_entry **head;
1631
1632         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
1633                 return;
1634
1635         entry->func = func;
1636
1637         head = &get_cpu_var(perf_pending_head);
1638
1639         do {
1640                 entry->next = *head;
1641         } while (cmpxchg(head, entry->next, entry) != entry->next);
1642
1643         set_perf_counter_pending();
1644
1645         put_cpu_var(perf_pending_head);
1646 }
1647
1648 static int __perf_pending_run(void)
1649 {
1650         struct perf_pending_entry *list;
1651         int nr = 0;
1652
1653         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
1654         while (list != PENDING_TAIL) {
1655                 void (*func)(struct perf_pending_entry *);
1656                 struct perf_pending_entry *entry = list;
1657
1658                 list = list->next;
1659
1660                 func = entry->func;
1661                 entry->next = NULL;
1662                 /*
1663                  * Ensure we observe the unqueue before we issue the wakeup,
1664                  * so that we won't be waiting forever.
1665                  * -- see perf_not_pending().
1666                  */
1667                 smp_wmb();
1668
1669                 func(entry);
1670                 nr++;
1671         }
1672
1673         return nr;
1674 }
1675
1676 static inline int perf_not_pending(struct perf_counter *counter)
1677 {
1678         /*
1679          * If we flush on whatever cpu we run, there is a chance we don't
1680          * need to wait.
1681          */
1682         get_cpu();
1683         __perf_pending_run();
1684         put_cpu();
1685
1686         /*
1687          * Ensure we see the proper queue state before going to sleep
1688          * so that we do not miss the wakeup. -- see perf_pending_handle()
1689          */
1690         smp_rmb();
1691         return counter->pending.next == NULL;
1692 }
1693
1694 static void perf_pending_sync(struct perf_counter *counter)
1695 {
1696         wait_event(counter->waitq, perf_not_pending(counter));
1697 }
1698
1699 void perf_counter_do_pending(void)
1700 {
1701         __perf_pending_run();
1702 }
1703
1704 /*
1705  * Callchain support -- arch specific
1706  */
1707
1708 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1709 {
1710         return NULL;
1711 }
1712
1713 /*
1714  * Output
1715  */
1716
1717 struct perf_output_handle {
1718         struct perf_counter     *counter;
1719         struct perf_mmap_data   *data;
1720         unsigned int            offset;
1721         unsigned int            head;
1722         int                     wakeup;
1723         int                     nmi;
1724         int                     overflow;
1725 };
1726
1727 static inline void __perf_output_wakeup(struct perf_output_handle *handle)
1728 {
1729         if (handle->nmi) {
1730                 handle->counter->pending_wakeup = 1;
1731                 perf_pending_queue(&handle->counter->pending,
1732                                    perf_pending_counter);
1733         } else
1734                 perf_counter_wakeup(handle->counter);
1735 }
1736
1737 static int perf_output_begin(struct perf_output_handle *handle,
1738                              struct perf_counter *counter, unsigned int size,
1739                              int nmi, int overflow)
1740 {
1741         struct perf_mmap_data *data;
1742         unsigned int offset, head;
1743
1744         rcu_read_lock();
1745         data = rcu_dereference(counter->data);
1746         if (!data)
1747                 goto out;
1748
1749         handle->counter  = counter;
1750         handle->nmi      = nmi;
1751         handle->overflow = overflow;
1752
1753         if (!data->nr_pages)
1754                 goto fail;
1755
1756         do {
1757                 offset = head = atomic_read(&data->head);
1758                 head += size;
1759         } while (atomic_cmpxchg(&data->head, offset, head) != offset);
1760
1761         handle->data    = data;
1762         handle->offset  = offset;
1763         handle->head    = head;
1764         handle->wakeup  = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
1765
1766         return 0;
1767
1768 fail:
1769         __perf_output_wakeup(handle);
1770 out:
1771         rcu_read_unlock();
1772
1773         return -ENOSPC;
1774 }
1775
1776 static void perf_output_copy(struct perf_output_handle *handle,
1777                              void *buf, unsigned int len)
1778 {
1779         unsigned int pages_mask;
1780         unsigned int offset;
1781         unsigned int size;
1782         void **pages;
1783
1784         offset          = handle->offset;
1785         pages_mask      = handle->data->nr_pages - 1;
1786         pages           = handle->data->data_pages;
1787
1788         do {
1789                 unsigned int page_offset;
1790                 int nr;
1791
1792                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
1793                 page_offset = offset & (PAGE_SIZE - 1);
1794                 size        = min_t(unsigned int, PAGE_SIZE - page_offset, len);
1795
1796                 memcpy(pages[nr] + page_offset, buf, size);
1797
1798                 len         -= size;
1799                 buf         += size;
1800                 offset      += size;
1801         } while (len);
1802
1803         handle->offset = offset;
1804
1805         WARN_ON_ONCE(handle->offset > handle->head);
1806 }
1807
1808 #define perf_output_put(handle, x) \
1809         perf_output_copy((handle), &(x), sizeof(x))
1810
1811 static void perf_output_end(struct perf_output_handle *handle)
1812 {
1813         int wakeup_events = handle->counter->hw_event.wakeup_events;
1814
1815         if (handle->overflow && wakeup_events) {
1816                 int events = atomic_inc_return(&handle->data->events);
1817                 if (events >= wakeup_events) {
1818                         atomic_sub(wakeup_events, &handle->data->events);
1819                         __perf_output_wakeup(handle);
1820                 }
1821         } else if (handle->wakeup)
1822                 __perf_output_wakeup(handle);
1823         rcu_read_unlock();
1824 }
1825
1826 static void perf_counter_output(struct perf_counter *counter,
1827                                 int nmi, struct pt_regs *regs)
1828 {
1829         int ret;
1830         u64 record_type = counter->hw_event.record_type;
1831         struct perf_output_handle handle;
1832         struct perf_event_header header;
1833         u64 ip;
1834         struct {
1835                 u32 pid, tid;
1836         } tid_entry;
1837         struct {
1838                 u64 event;
1839                 u64 counter;
1840         } group_entry;
1841         struct perf_callchain_entry *callchain = NULL;
1842         int callchain_size = 0;
1843         u64 time;
1844
1845         header.type = PERF_EVENT_COUNTER_OVERFLOW;
1846         header.size = sizeof(header);
1847
1848         if (record_type & PERF_RECORD_IP) {
1849                 ip = instruction_pointer(regs);
1850                 header.type |= __PERF_EVENT_IP;
1851                 header.size += sizeof(ip);
1852         }
1853
1854         if (record_type & PERF_RECORD_TID) {
1855                 /* namespace issues */
1856                 tid_entry.pid = current->group_leader->pid;
1857                 tid_entry.tid = current->pid;
1858
1859                 header.type |= __PERF_EVENT_TID;
1860                 header.size += sizeof(tid_entry);
1861         }
1862
1863         if (record_type & PERF_RECORD_GROUP) {
1864                 header.type |= __PERF_EVENT_GROUP;
1865                 header.size += sizeof(u64) +
1866                         counter->nr_siblings * sizeof(group_entry);
1867         }
1868
1869         if (record_type & PERF_RECORD_CALLCHAIN) {
1870                 callchain = perf_callchain(regs);
1871
1872                 if (callchain) {
1873                         callchain_size = (1 + callchain->nr) * sizeof(u64);
1874
1875                         header.type |= __PERF_EVENT_CALLCHAIN;
1876                         header.size += callchain_size;
1877                 }
1878         }
1879
1880         if (record_type & PERF_RECORD_TIME) {
1881                 /*
1882                  * Maybe do better on x86 and provide cpu_clock_nmi()
1883                  */
1884                 time = sched_clock();
1885
1886                 header.type |= __PERF_EVENT_TIME;
1887                 header.size += sizeof(u64);
1888         }
1889
1890         ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
1891         if (ret)
1892                 return;
1893
1894         perf_output_put(&handle, header);
1895
1896         if (record_type & PERF_RECORD_IP)
1897                 perf_output_put(&handle, ip);
1898
1899         if (record_type & PERF_RECORD_TID)
1900                 perf_output_put(&handle, tid_entry);
1901
1902         if (record_type & PERF_RECORD_GROUP) {
1903                 struct perf_counter *leader, *sub;
1904                 u64 nr = counter->nr_siblings;
1905
1906                 perf_output_put(&handle, nr);
1907
1908                 leader = counter->group_leader;
1909                 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1910                         if (sub != counter)
1911                                 sub->hw_ops->read(sub);
1912
1913                         group_entry.event = sub->hw_event.config;
1914                         group_entry.counter = atomic64_read(&sub->count);
1915
1916                         perf_output_put(&handle, group_entry);
1917                 }
1918         }
1919
1920         if (callchain)
1921                 perf_output_copy(&handle, callchain, callchain_size);
1922
1923         if (record_type & PERF_RECORD_TIME)
1924                 perf_output_put(&handle, time);
1925
1926         perf_output_end(&handle);
1927 }
1928
1929 /*
1930  * mmap tracking
1931  */
1932
1933 struct perf_mmap_event {
1934         struct file     *file;
1935         char            *file_name;
1936         int             file_size;
1937
1938         struct {
1939                 struct perf_event_header        header;
1940
1941                 u32                             pid;
1942                 u32                             tid;
1943                 u64                             start;
1944                 u64                             len;
1945                 u64                             pgoff;
1946         } event;
1947 };
1948
1949 static void perf_counter_mmap_output(struct perf_counter *counter,
1950                                      struct perf_mmap_event *mmap_event)
1951 {
1952         struct perf_output_handle handle;
1953         int size = mmap_event->event.header.size;
1954         int ret = perf_output_begin(&handle, counter, size, 0, 0);
1955
1956         if (ret)
1957                 return;
1958
1959         perf_output_put(&handle, mmap_event->event);
1960         perf_output_copy(&handle, mmap_event->file_name,
1961                                    mmap_event->file_size);
1962         perf_output_end(&handle);
1963 }
1964
1965 static int perf_counter_mmap_match(struct perf_counter *counter,
1966                                    struct perf_mmap_event *mmap_event)
1967 {
1968         if (counter->hw_event.mmap &&
1969             mmap_event->event.header.type == PERF_EVENT_MMAP)
1970                 return 1;
1971
1972         if (counter->hw_event.munmap &&
1973             mmap_event->event.header.type == PERF_EVENT_MUNMAP)
1974                 return 1;
1975
1976         return 0;
1977 }
1978
1979 static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
1980                                   struct perf_mmap_event *mmap_event)
1981 {
1982         struct perf_counter *counter;
1983
1984         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
1985                 return;
1986
1987         rcu_read_lock();
1988         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
1989                 if (perf_counter_mmap_match(counter, mmap_event))
1990                         perf_counter_mmap_output(counter, mmap_event);
1991         }
1992         rcu_read_unlock();
1993 }
1994
1995 static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
1996 {
1997         struct perf_cpu_context *cpuctx;
1998         struct file *file = mmap_event->file;
1999         unsigned int size;
2000         char tmp[16];
2001         char *buf = NULL;
2002         char *name;
2003
2004         if (file) {
2005                 buf = kzalloc(PATH_MAX, GFP_KERNEL);
2006                 if (!buf) {
2007                         name = strncpy(tmp, "//enomem", sizeof(tmp));
2008                         goto got_name;
2009                 }
2010                 name = dentry_path(file->f_dentry, buf, PATH_MAX);
2011                 if (IS_ERR(name)) {
2012                         name = strncpy(tmp, "//toolong", sizeof(tmp));
2013                         goto got_name;
2014                 }
2015         } else {
2016                 name = strncpy(tmp, "//anon", sizeof(tmp));
2017                 goto got_name;
2018         }
2019
2020 got_name:
2021         size = ALIGN(strlen(name), sizeof(u64));
2022
2023         mmap_event->file_name = name;
2024         mmap_event->file_size = size;
2025
2026         mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2027
2028         cpuctx = &get_cpu_var(perf_cpu_context);
2029         perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2030         put_cpu_var(perf_cpu_context);
2031
2032         perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
2033
2034         kfree(buf);
2035 }
2036
2037 void perf_counter_mmap(unsigned long addr, unsigned long len,
2038                        unsigned long pgoff, struct file *file)
2039 {
2040         struct perf_mmap_event mmap_event = {
2041                 .file   = file,
2042                 .event  = {
2043                         .header = { .type = PERF_EVENT_MMAP, },
2044                         .pid    = current->group_leader->pid,
2045                         .tid    = current->pid,
2046                         .start  = addr,
2047                         .len    = len,
2048                         .pgoff  = pgoff,
2049                 },
2050         };
2051
2052         perf_counter_mmap_event(&mmap_event);
2053 }
2054
2055 void perf_counter_munmap(unsigned long addr, unsigned long len,
2056                          unsigned long pgoff, struct file *file)
2057 {
2058         struct perf_mmap_event mmap_event = {
2059                 .file   = file,
2060                 .event  = {
2061                         .header = { .type = PERF_EVENT_MUNMAP, },
2062                         .pid    = current->group_leader->pid,
2063                         .tid    = current->pid,
2064                         .start  = addr,
2065                         .len    = len,
2066                         .pgoff  = pgoff,
2067                 },
2068         };
2069
2070         perf_counter_mmap_event(&mmap_event);
2071 }
2072
2073 /*
2074  * Generic counter overflow handling.
2075  */
2076
2077 int perf_counter_overflow(struct perf_counter *counter,
2078                           int nmi, struct pt_regs *regs)
2079 {
2080         int events = atomic_read(&counter->event_limit);
2081         int ret = 0;
2082
2083         counter->pending_kill = POLL_IN;
2084         if (events && atomic_dec_and_test(&counter->event_limit)) {
2085                 ret = 1;
2086                 counter->pending_kill = POLL_HUP;
2087                 if (nmi) {
2088                         counter->pending_disable = 1;
2089                         perf_pending_queue(&counter->pending,
2090                                            perf_pending_counter);
2091                 } else
2092                         perf_counter_disable(counter);
2093         }
2094
2095         perf_counter_output(counter, nmi, regs);
2096         return ret;
2097 }
2098
2099 /*
2100  * Generic software counter infrastructure
2101  */
2102
2103 static void perf_swcounter_update(struct perf_counter *counter)
2104 {
2105         struct hw_perf_counter *hwc = &counter->hw;
2106         u64 prev, now;
2107         s64 delta;
2108
2109 again:
2110         prev = atomic64_read(&hwc->prev_count);
2111         now = atomic64_read(&hwc->count);
2112         if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
2113                 goto again;
2114
2115         delta = now - prev;
2116
2117         atomic64_add(delta, &counter->count);
2118         atomic64_sub(delta, &hwc->period_left);
2119 }
2120
2121 static void perf_swcounter_set_period(struct perf_counter *counter)
2122 {
2123         struct hw_perf_counter *hwc = &counter->hw;
2124         s64 left = atomic64_read(&hwc->period_left);
2125         s64 period = hwc->irq_period;
2126
2127         if (unlikely(left <= -period)) {
2128                 left = period;
2129                 atomic64_set(&hwc->period_left, left);
2130         }
2131
2132         if (unlikely(left <= 0)) {
2133                 left += period;
2134                 atomic64_add(period, &hwc->period_left);
2135         }
2136
2137         atomic64_set(&hwc->prev_count, -left);
2138         atomic64_set(&hwc->count, -left);
2139 }
2140
2141 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
2142 {
2143         enum hrtimer_restart ret = HRTIMER_RESTART;
2144         struct perf_counter *counter;
2145         struct pt_regs *regs;
2146
2147         counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
2148         counter->hw_ops->read(counter);
2149
2150         regs = get_irq_regs();
2151         /*
2152          * In case we exclude kernel IPs or are somehow not in interrupt
2153          * context, provide the next best thing, the user IP.
2154          */
2155         if ((counter->hw_event.exclude_kernel || !regs) &&
2156                         !counter->hw_event.exclude_user)
2157                 regs = task_pt_regs(current);
2158
2159         if (regs) {
2160                 if (perf_counter_overflow(counter, 0, regs))
2161                         ret = HRTIMER_NORESTART;
2162         }
2163
2164         hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
2165
2166         return ret;
2167 }
2168
2169 static void perf_swcounter_overflow(struct perf_counter *counter,
2170                                     int nmi, struct pt_regs *regs)
2171 {
2172         perf_swcounter_update(counter);
2173         perf_swcounter_set_period(counter);
2174         if (perf_counter_overflow(counter, nmi, regs))
2175                 /* soft-disable the counter */
2176                 ;
2177
2178 }
2179
2180 static int perf_swcounter_match(struct perf_counter *counter,
2181                                 enum perf_event_types type,
2182                                 u32 event, struct pt_regs *regs)
2183 {
2184         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2185                 return 0;
2186
2187         if (perf_event_raw(&counter->hw_event))
2188                 return 0;
2189
2190         if (perf_event_type(&counter->hw_event) != type)
2191                 return 0;
2192
2193         if (perf_event_id(&counter->hw_event) != event)
2194                 return 0;
2195
2196         if (counter->hw_event.exclude_user && user_mode(regs))
2197                 return 0;
2198
2199         if (counter->hw_event.exclude_kernel && !user_mode(regs))
2200                 return 0;
2201
2202         return 1;
2203 }
2204
2205 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
2206                                int nmi, struct pt_regs *regs)
2207 {
2208         int neg = atomic64_add_negative(nr, &counter->hw.count);
2209         if (counter->hw.irq_period && !neg)
2210                 perf_swcounter_overflow(counter, nmi, regs);
2211 }
2212
2213 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
2214                                      enum perf_event_types type, u32 event,
2215                                      u64 nr, int nmi, struct pt_regs *regs)
2216 {
2217         struct perf_counter *counter;
2218
2219         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2220                 return;
2221
2222         rcu_read_lock();
2223         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2224                 if (perf_swcounter_match(counter, type, event, regs))
2225                         perf_swcounter_add(counter, nr, nmi, regs);
2226         }
2227         rcu_read_unlock();
2228 }
2229
2230 static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
2231 {
2232         if (in_nmi())
2233                 return &cpuctx->recursion[3];
2234
2235         if (in_irq())
2236                 return &cpuctx->recursion[2];
2237
2238         if (in_softirq())
2239                 return &cpuctx->recursion[1];
2240
2241         return &cpuctx->recursion[0];
2242 }
2243
2244 static void __perf_swcounter_event(enum perf_event_types type, u32 event,
2245                                    u64 nr, int nmi, struct pt_regs *regs)
2246 {
2247         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
2248         int *recursion = perf_swcounter_recursion_context(cpuctx);
2249
2250         if (*recursion)
2251                 goto out;
2252
2253         (*recursion)++;
2254         barrier();
2255
2256         perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
2257         if (cpuctx->task_ctx) {
2258                 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
2259                                 nr, nmi, regs);
2260         }
2261
2262         barrier();
2263         (*recursion)--;
2264
2265 out:
2266         put_cpu_var(perf_cpu_context);
2267 }
2268
2269 void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
2270 {
2271         __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
2272 }
2273
2274 static void perf_swcounter_read(struct perf_counter *counter)
2275 {
2276         perf_swcounter_update(counter);
2277 }
2278
2279 static int perf_swcounter_enable(struct perf_counter *counter)
2280 {
2281         perf_swcounter_set_period(counter);
2282         return 0;
2283 }
2284
2285 static void perf_swcounter_disable(struct perf_counter *counter)
2286 {
2287         perf_swcounter_update(counter);
2288 }
2289
2290 static const struct hw_perf_counter_ops perf_ops_generic = {
2291         .enable         = perf_swcounter_enable,
2292         .disable        = perf_swcounter_disable,
2293         .read           = perf_swcounter_read,
2294 };
2295
2296 /*
2297  * Software counter: cpu wall time clock
2298  */
2299
2300 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
2301 {
2302         int cpu = raw_smp_processor_id();
2303         s64 prev;
2304         u64 now;
2305
2306         now = cpu_clock(cpu);
2307         prev = atomic64_read(&counter->hw.prev_count);
2308         atomic64_set(&counter->hw.prev_count, now);
2309         atomic64_add(now - prev, &counter->count);
2310 }
2311
2312 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
2313 {
2314         struct hw_perf_counter *hwc = &counter->hw;
2315         int cpu = raw_smp_processor_id();
2316
2317         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
2318         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2319         hwc->hrtimer.function = perf_swcounter_hrtimer;
2320         if (hwc->irq_period) {
2321                 __hrtimer_start_range_ns(&hwc->hrtimer,
2322                                 ns_to_ktime(hwc->irq_period), 0,
2323                                 HRTIMER_MODE_REL, 0);
2324         }
2325
2326         return 0;
2327 }
2328
2329 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
2330 {
2331         hrtimer_cancel(&counter->hw.hrtimer);
2332         cpu_clock_perf_counter_update(counter);
2333 }
2334
2335 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
2336 {
2337         cpu_clock_perf_counter_update(counter);
2338 }
2339
2340 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
2341         .enable         = cpu_clock_perf_counter_enable,
2342         .disable        = cpu_clock_perf_counter_disable,
2343         .read           = cpu_clock_perf_counter_read,
2344 };
2345
2346 /*
2347  * Software counter: task time clock
2348  */
2349
2350 /*
2351  * Called from within the scheduler:
2352  */
2353 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
2354 {
2355         struct task_struct *curr = counter->task;
2356         u64 delta;
2357
2358         delta = __task_delta_exec(curr, update);
2359
2360         return curr->se.sum_exec_runtime + delta;
2361 }
2362
2363 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
2364 {
2365         u64 prev;
2366         s64 delta;
2367
2368         prev = atomic64_read(&counter->hw.prev_count);
2369
2370         atomic64_set(&counter->hw.prev_count, now);
2371
2372         delta = now - prev;
2373
2374         atomic64_add(delta, &counter->count);
2375 }
2376
2377 static int task_clock_perf_counter_enable(struct perf_counter *counter)
2378 {
2379         struct hw_perf_counter *hwc = &counter->hw;
2380
2381         atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
2382         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2383         hwc->hrtimer.function = perf_swcounter_hrtimer;
2384         if (hwc->irq_period) {
2385                 __hrtimer_start_range_ns(&hwc->hrtimer,
2386                                 ns_to_ktime(hwc->irq_period), 0,
2387                                 HRTIMER_MODE_REL, 0);
2388         }
2389
2390         return 0;
2391 }
2392
2393 static void task_clock_perf_counter_disable(struct perf_counter *counter)
2394 {
2395         hrtimer_cancel(&counter->hw.hrtimer);
2396         task_clock_perf_counter_update(counter,
2397                         task_clock_perf_counter_val(counter, 0));
2398 }
2399
2400 static void task_clock_perf_counter_read(struct perf_counter *counter)
2401 {
2402         task_clock_perf_counter_update(counter,
2403                         task_clock_perf_counter_val(counter, 1));
2404 }
2405
2406 static const struct hw_perf_counter_ops perf_ops_task_clock = {
2407         .enable         = task_clock_perf_counter_enable,
2408         .disable        = task_clock_perf_counter_disable,
2409         .read           = task_clock_perf_counter_read,
2410 };
2411
2412 /*
2413  * Software counter: cpu migrations
2414  */
2415
2416 static inline u64 get_cpu_migrations(struct perf_counter *counter)
2417 {
2418         struct task_struct *curr = counter->ctx->task;
2419
2420         if (curr)
2421                 return curr->se.nr_migrations;
2422         return cpu_nr_migrations(smp_processor_id());
2423 }
2424
2425 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
2426 {
2427         u64 prev, now;
2428         s64 delta;
2429
2430         prev = atomic64_read(&counter->hw.prev_count);
2431         now = get_cpu_migrations(counter);
2432
2433         atomic64_set(&counter->hw.prev_count, now);
2434
2435         delta = now - prev;
2436
2437         atomic64_add(delta, &counter->count);
2438 }
2439
2440 static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
2441 {
2442         cpu_migrations_perf_counter_update(counter);
2443 }
2444
2445 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
2446 {
2447         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
2448                 atomic64_set(&counter->hw.prev_count,
2449                              get_cpu_migrations(counter));
2450         return 0;
2451 }
2452
2453 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
2454 {
2455         cpu_migrations_perf_counter_update(counter);
2456 }
2457
2458 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
2459         .enable         = cpu_migrations_perf_counter_enable,
2460         .disable        = cpu_migrations_perf_counter_disable,
2461         .read           = cpu_migrations_perf_counter_read,
2462 };
2463
2464 #ifdef CONFIG_EVENT_PROFILE
2465 void perf_tpcounter_event(int event_id)
2466 {
2467         struct pt_regs *regs = get_irq_regs();
2468
2469         if (!regs)
2470                 regs = task_pt_regs(current);
2471
2472         __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
2473 }
2474
2475 extern int ftrace_profile_enable(int);
2476 extern void ftrace_profile_disable(int);
2477
2478 static void tp_perf_counter_destroy(struct perf_counter *counter)
2479 {
2480         ftrace_profile_disable(perf_event_id(&counter->hw_event));
2481 }
2482
2483 static const struct hw_perf_counter_ops *
2484 tp_perf_counter_init(struct perf_counter *counter)
2485 {
2486         int event_id = perf_event_id(&counter->hw_event);
2487         int ret;
2488
2489         ret = ftrace_profile_enable(event_id);
2490         if (ret)
2491                 return NULL;
2492
2493         counter->destroy = tp_perf_counter_destroy;
2494         counter->hw.irq_period = counter->hw_event.irq_period;
2495
2496         return &perf_ops_generic;
2497 }
2498 #else
2499 static const struct hw_perf_counter_ops *
2500 tp_perf_counter_init(struct perf_counter *counter)
2501 {
2502         return NULL;
2503 }
2504 #endif
2505
2506 static const struct hw_perf_counter_ops *
2507 sw_perf_counter_init(struct perf_counter *counter)
2508 {
2509         struct perf_counter_hw_event *hw_event = &counter->hw_event;
2510         const struct hw_perf_counter_ops *hw_ops = NULL;
2511         struct hw_perf_counter *hwc = &counter->hw;
2512
2513         /*
2514          * Software counters (currently) can't in general distinguish
2515          * between user, kernel and hypervisor events.
2516          * However, context switches and cpu migrations are considered
2517          * to be kernel events, and page faults are never hypervisor
2518          * events.
2519          */
2520         switch (perf_event_id(&counter->hw_event)) {
2521         case PERF_COUNT_CPU_CLOCK:
2522                 hw_ops = &perf_ops_cpu_clock;
2523
2524                 if (hw_event->irq_period && hw_event->irq_period < 10000)
2525                         hw_event->irq_period = 10000;
2526                 break;
2527         case PERF_COUNT_TASK_CLOCK:
2528                 /*
2529                  * If the user instantiates this as a per-cpu counter,
2530                  * use the cpu_clock counter instead.
2531                  */
2532                 if (counter->ctx->task)
2533                         hw_ops = &perf_ops_task_clock;
2534                 else
2535                         hw_ops = &perf_ops_cpu_clock;
2536
2537                 if (hw_event->irq_period && hw_event->irq_period < 10000)
2538                         hw_event->irq_period = 10000;
2539                 break;
2540         case PERF_COUNT_PAGE_FAULTS:
2541         case PERF_COUNT_PAGE_FAULTS_MIN:
2542         case PERF_COUNT_PAGE_FAULTS_MAJ:
2543         case PERF_COUNT_CONTEXT_SWITCHES:
2544                 hw_ops = &perf_ops_generic;
2545                 break;
2546         case PERF_COUNT_CPU_MIGRATIONS:
2547                 if (!counter->hw_event.exclude_kernel)
2548                         hw_ops = &perf_ops_cpu_migrations;
2549                 break;
2550         }
2551
2552         if (hw_ops)
2553                 hwc->irq_period = hw_event->irq_period;
2554
2555         return hw_ops;
2556 }
2557
2558 /*
2559  * Allocate and initialize a counter structure
2560  */
2561 static struct perf_counter *
2562 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
2563                    int cpu,
2564                    struct perf_counter_context *ctx,
2565                    struct perf_counter *group_leader,
2566                    gfp_t gfpflags)
2567 {
2568         const struct hw_perf_counter_ops *hw_ops;
2569         struct perf_counter *counter;
2570         long err;
2571
2572         counter = kzalloc(sizeof(*counter), gfpflags);
2573         if (!counter)
2574                 return ERR_PTR(-ENOMEM);
2575
2576         /*
2577          * Single counters are their own group leaders, with an
2578          * empty sibling list:
2579          */
2580         if (!group_leader)
2581                 group_leader = counter;
2582
2583         mutex_init(&counter->mutex);
2584         INIT_LIST_HEAD(&counter->list_entry);
2585         INIT_LIST_HEAD(&counter->event_entry);
2586         INIT_LIST_HEAD(&counter->sibling_list);
2587         init_waitqueue_head(&counter->waitq);
2588
2589         mutex_init(&counter->mmap_mutex);
2590
2591         INIT_LIST_HEAD(&counter->child_list);
2592
2593         counter->cpu                    = cpu;
2594         counter->hw_event               = *hw_event;
2595         counter->group_leader           = group_leader;
2596         counter->hw_ops                 = NULL;
2597         counter->ctx                    = ctx;
2598
2599         counter->state = PERF_COUNTER_STATE_INACTIVE;
2600         if (hw_event->disabled)
2601                 counter->state = PERF_COUNTER_STATE_OFF;
2602
2603         hw_ops = NULL;
2604
2605         if (perf_event_raw(hw_event)) {
2606                 hw_ops = hw_perf_counter_init(counter);
2607                 goto done;
2608         }
2609
2610         switch (perf_event_type(hw_event)) {
2611         case PERF_TYPE_HARDWARE:
2612                 hw_ops = hw_perf_counter_init(counter);
2613                 break;
2614
2615         case PERF_TYPE_SOFTWARE:
2616                 hw_ops = sw_perf_counter_init(counter);
2617                 break;
2618
2619         case PERF_TYPE_TRACEPOINT:
2620                 hw_ops = tp_perf_counter_init(counter);
2621                 break;
2622         }
2623 done:
2624         err = 0;
2625         if (!hw_ops)
2626                 err = -EINVAL;
2627         else if (IS_ERR(hw_ops))
2628                 err = PTR_ERR(hw_ops);
2629
2630         if (err) {
2631                 kfree(counter);
2632                 return ERR_PTR(err);
2633         }
2634
2635         counter->hw_ops = hw_ops;
2636
2637         return counter;
2638 }
2639
2640 /**
2641  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
2642  *
2643  * @hw_event_uptr:      event type attributes for monitoring/sampling
2644  * @pid:                target pid
2645  * @cpu:                target cpu
2646  * @group_fd:           group leader counter fd
2647  */
2648 SYSCALL_DEFINE5(perf_counter_open,
2649                 const struct perf_counter_hw_event __user *, hw_event_uptr,
2650                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
2651 {
2652         struct perf_counter *counter, *group_leader;
2653         struct perf_counter_hw_event hw_event;
2654         struct perf_counter_context *ctx;
2655         struct file *counter_file = NULL;
2656         struct file *group_file = NULL;
2657         int fput_needed = 0;
2658         int fput_needed2 = 0;
2659         int ret;
2660
2661         /* for future expandability... */
2662         if (flags)
2663                 return -EINVAL;
2664
2665         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
2666                 return -EFAULT;
2667
2668         /*
2669          * Get the target context (task or percpu):
2670          */
2671         ctx = find_get_context(pid, cpu);
2672         if (IS_ERR(ctx))
2673                 return PTR_ERR(ctx);
2674
2675         /*
2676          * Look up the group leader (we will attach this counter to it):
2677          */
2678         group_leader = NULL;
2679         if (group_fd != -1) {
2680                 ret = -EINVAL;
2681                 group_file = fget_light(group_fd, &fput_needed);
2682                 if (!group_file)
2683                         goto err_put_context;
2684                 if (group_file->f_op != &perf_fops)
2685                         goto err_put_context;
2686
2687                 group_leader = group_file->private_data;
2688                 /*
2689                  * Do not allow a recursive hierarchy (this new sibling
2690                  * becoming part of another group-sibling):
2691                  */
2692                 if (group_leader->group_leader != group_leader)
2693                         goto err_put_context;
2694                 /*
2695                  * Do not allow to attach to a group in a different
2696                  * task or CPU context:
2697                  */
2698                 if (group_leader->ctx != ctx)
2699                         goto err_put_context;
2700                 /*
2701                  * Only a group leader can be exclusive or pinned
2702                  */
2703                 if (hw_event.exclusive || hw_event.pinned)
2704                         goto err_put_context;
2705         }
2706
2707         counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2708                                      GFP_KERNEL);
2709         ret = PTR_ERR(counter);
2710         if (IS_ERR(counter))
2711                 goto err_put_context;
2712
2713         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2714         if (ret < 0)
2715                 goto err_free_put_context;
2716
2717         counter_file = fget_light(ret, &fput_needed2);
2718         if (!counter_file)
2719                 goto err_free_put_context;
2720
2721         counter->filp = counter_file;
2722         mutex_lock(&ctx->mutex);
2723         perf_install_in_context(ctx, counter, cpu);
2724         mutex_unlock(&ctx->mutex);
2725
2726         fput_light(counter_file, fput_needed2);
2727
2728 out_fput:
2729         fput_light(group_file, fput_needed);
2730
2731         return ret;
2732
2733 err_free_put_context:
2734         kfree(counter);
2735
2736 err_put_context:
2737         put_context(ctx);
2738
2739         goto out_fput;
2740 }
2741
2742 /*
2743  * Initialize the perf_counter context in a task_struct:
2744  */
2745 static void
2746 __perf_counter_init_context(struct perf_counter_context *ctx,
2747                             struct task_struct *task)
2748 {
2749         memset(ctx, 0, sizeof(*ctx));
2750         spin_lock_init(&ctx->lock);
2751         mutex_init(&ctx->mutex);
2752         INIT_LIST_HEAD(&ctx->counter_list);
2753         INIT_LIST_HEAD(&ctx->event_list);
2754         ctx->task = task;
2755 }
2756
2757 /*
2758  * inherit a counter from parent task to child task:
2759  */
2760 static struct perf_counter *
2761 inherit_counter(struct perf_counter *parent_counter,
2762               struct task_struct *parent,
2763               struct perf_counter_context *parent_ctx,
2764               struct task_struct *child,
2765               struct perf_counter *group_leader,
2766               struct perf_counter_context *child_ctx)
2767 {
2768         struct perf_counter *child_counter;
2769
2770         /*
2771          * Instead of creating recursive hierarchies of counters,
2772          * we link inherited counters back to the original parent,
2773          * which has a filp for sure, which we use as the reference
2774          * count:
2775          */
2776         if (parent_counter->parent)
2777                 parent_counter = parent_counter->parent;
2778
2779         child_counter = perf_counter_alloc(&parent_counter->hw_event,
2780                                            parent_counter->cpu, child_ctx,
2781                                            group_leader, GFP_KERNEL);
2782         if (IS_ERR(child_counter))
2783                 return child_counter;
2784
2785         /*
2786          * Link it up in the child's context:
2787          */
2788         child_counter->task = child;
2789         add_counter_to_ctx(child_counter, child_ctx);
2790
2791         child_counter->parent = parent_counter;
2792         /*
2793          * inherit into child's child as well:
2794          */
2795         child_counter->hw_event.inherit = 1;
2796
2797         /*
2798          * Get a reference to the parent filp - we will fput it
2799          * when the child counter exits. This is safe to do because
2800          * we are in the parent and we know that the filp still
2801          * exists and has a nonzero count:
2802          */
2803         atomic_long_inc(&parent_counter->filp->f_count);
2804
2805         /*
2806          * Link this into the parent counter's child list
2807          */
2808         mutex_lock(&parent_counter->mutex);
2809         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2810
2811         /*
2812          * Make the child state follow the state of the parent counter,
2813          * not its hw_event.disabled bit.  We hold the parent's mutex,
2814          * so we won't race with perf_counter_{en,dis}able_family.
2815          */
2816         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2817                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2818         else
2819                 child_counter->state = PERF_COUNTER_STATE_OFF;
2820
2821         mutex_unlock(&parent_counter->mutex);
2822
2823         return child_counter;
2824 }
2825
2826 static int inherit_group(struct perf_counter *parent_counter,
2827               struct task_struct *parent,
2828               struct perf_counter_context *parent_ctx,
2829               struct task_struct *child,
2830               struct perf_counter_context *child_ctx)
2831 {
2832         struct perf_counter *leader;
2833         struct perf_counter *sub;
2834         struct perf_counter *child_ctr;
2835
2836         leader = inherit_counter(parent_counter, parent, parent_ctx,
2837                                  child, NULL, child_ctx);
2838         if (IS_ERR(leader))
2839                 return PTR_ERR(leader);
2840         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2841                 child_ctr = inherit_counter(sub, parent, parent_ctx,
2842                                             child, leader, child_ctx);
2843                 if (IS_ERR(child_ctr))
2844                         return PTR_ERR(child_ctr);
2845         }
2846         return 0;
2847 }
2848
2849 static void sync_child_counter(struct perf_counter *child_counter,
2850                                struct perf_counter *parent_counter)
2851 {
2852         u64 parent_val, child_val;
2853
2854         parent_val = atomic64_read(&parent_counter->count);
2855         child_val = atomic64_read(&child_counter->count);
2856
2857         /*
2858          * Add back the child's count to the parent's count:
2859          */
2860         atomic64_add(child_val, &parent_counter->count);
2861         atomic64_add(child_counter->total_time_enabled,
2862                      &parent_counter->child_total_time_enabled);
2863         atomic64_add(child_counter->total_time_running,
2864                      &parent_counter->child_total_time_running);
2865
2866         /*
2867          * Remove this counter from the parent's list
2868          */
2869         mutex_lock(&parent_counter->mutex);
2870         list_del_init(&child_counter->child_list);
2871         mutex_unlock(&parent_counter->mutex);
2872
2873         /*
2874          * Release the parent counter, if this was the last
2875          * reference to it.
2876          */
2877         fput(parent_counter->filp);
2878 }
2879
2880 static void
2881 __perf_counter_exit_task(struct task_struct *child,
2882                          struct perf_counter *child_counter,
2883                          struct perf_counter_context *child_ctx)
2884 {
2885         struct perf_counter *parent_counter;
2886         struct perf_counter *sub, *tmp;
2887
2888         /*
2889          * If we do not self-reap then we have to wait for the
2890          * child task to unschedule (it will happen for sure),
2891          * so that its counter is at its final count. (This
2892          * condition triggers rarely - child tasks usually get
2893          * off their CPU before the parent has a chance to
2894          * get this far into the reaping action)
2895          */
2896         if (child != current) {
2897                 wait_task_inactive(child, 0);
2898                 list_del_init(&child_counter->list_entry);
2899                 update_counter_times(child_counter);
2900         } else {
2901                 struct perf_cpu_context *cpuctx;
2902                 unsigned long flags;
2903                 u64 perf_flags;
2904
2905                 /*
2906                  * Disable and unlink this counter.
2907                  *
2908                  * Be careful about zapping the list - IRQ/NMI context
2909                  * could still be processing it:
2910                  */
2911                 curr_rq_lock_irq_save(&flags);
2912                 perf_flags = hw_perf_save_disable();
2913
2914                 cpuctx = &__get_cpu_var(perf_cpu_context);
2915
2916                 group_sched_out(child_counter, cpuctx, child_ctx);
2917                 update_counter_times(child_counter);
2918
2919                 list_del_init(&child_counter->list_entry);
2920
2921                 child_ctx->nr_counters--;
2922
2923                 hw_perf_restore(perf_flags);
2924                 curr_rq_unlock_irq_restore(&flags);
2925         }
2926
2927         parent_counter = child_counter->parent;
2928         /*
2929          * It can happen that parent exits first, and has counters
2930          * that are still around due to the child reference. These
2931          * counters need to be zapped - but otherwise linger.
2932          */
2933         if (parent_counter) {
2934                 sync_child_counter(child_counter, parent_counter);
2935                 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
2936                                          list_entry) {
2937                         if (sub->parent) {
2938                                 sync_child_counter(sub, sub->parent);
2939                                 free_counter(sub);
2940                         }
2941                 }
2942                 free_counter(child_counter);
2943         }
2944 }
2945
2946 /*
2947  * When a child task exits, feed back counter values to parent counters.
2948  *
2949  * Note: we may be running in child context, but the PID is not hashed
2950  * anymore so new counters will not be added.
2951  */
2952 void perf_counter_exit_task(struct task_struct *child)
2953 {
2954         struct perf_counter *child_counter, *tmp;
2955         struct perf_counter_context *child_ctx;
2956
2957         child_ctx = &child->perf_counter_ctx;
2958
2959         if (likely(!child_ctx->nr_counters))
2960                 return;
2961
2962         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
2963                                  list_entry)
2964                 __perf_counter_exit_task(child, child_counter, child_ctx);
2965 }
2966
2967 /*
2968  * Initialize the perf_counter context in task_struct
2969  */
2970 void perf_counter_init_task(struct task_struct *child)
2971 {
2972         struct perf_counter_context *child_ctx, *parent_ctx;
2973         struct perf_counter *counter;
2974         struct task_struct *parent = current;
2975
2976         child_ctx  =  &child->perf_counter_ctx;
2977         parent_ctx = &parent->perf_counter_ctx;
2978
2979         __perf_counter_init_context(child_ctx, child);
2980
2981         /*
2982          * This is executed from the parent task context, so inherit
2983          * counters that have been marked for cloning:
2984          */
2985
2986         if (likely(!parent_ctx->nr_counters))
2987                 return;
2988
2989         /*
2990          * Lock the parent list. No need to lock the child - not PID
2991          * hashed yet and not running, so nobody can access it.
2992          */
2993         mutex_lock(&parent_ctx->mutex);
2994
2995         /*
2996          * We dont have to disable NMIs - we are only looking at
2997          * the list, not manipulating it:
2998          */
2999         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
3000                 if (!counter->hw_event.inherit)
3001                         continue;
3002
3003                 if (inherit_group(counter, parent,
3004                                   parent_ctx, child, child_ctx))
3005                         break;
3006         }
3007
3008         mutex_unlock(&parent_ctx->mutex);
3009 }
3010
3011 static void __cpuinit perf_counter_init_cpu(int cpu)
3012 {
3013         struct perf_cpu_context *cpuctx;
3014
3015         cpuctx = &per_cpu(perf_cpu_context, cpu);
3016         __perf_counter_init_context(&cpuctx->ctx, NULL);
3017
3018         mutex_lock(&perf_resource_mutex);
3019         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
3020         mutex_unlock(&perf_resource_mutex);
3021
3022         hw_perf_counter_setup(cpu);
3023 }
3024
3025 #ifdef CONFIG_HOTPLUG_CPU
3026 static void __perf_counter_exit_cpu(void *info)
3027 {
3028         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3029         struct perf_counter_context *ctx = &cpuctx->ctx;
3030         struct perf_counter *counter, *tmp;
3031
3032         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
3033                 __perf_counter_remove_from_context(counter);
3034 }
3035 static void perf_counter_exit_cpu(int cpu)
3036 {
3037         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3038         struct perf_counter_context *ctx = &cpuctx->ctx;
3039
3040         mutex_lock(&ctx->mutex);
3041         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
3042         mutex_unlock(&ctx->mutex);
3043 }
3044 #else
3045 static inline void perf_counter_exit_cpu(int cpu) { }
3046 #endif
3047
3048 static int __cpuinit
3049 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
3050 {
3051         unsigned int cpu = (long)hcpu;
3052
3053         switch (action) {
3054
3055         case CPU_UP_PREPARE:
3056         case CPU_UP_PREPARE_FROZEN:
3057                 perf_counter_init_cpu(cpu);
3058                 break;
3059
3060         case CPU_DOWN_PREPARE:
3061         case CPU_DOWN_PREPARE_FROZEN:
3062                 perf_counter_exit_cpu(cpu);
3063                 break;
3064
3065         default:
3066                 break;
3067         }
3068
3069         return NOTIFY_OK;
3070 }
3071
3072 static struct notifier_block __cpuinitdata perf_cpu_nb = {
3073         .notifier_call          = perf_cpu_notify,
3074 };
3075
3076 static int __init perf_counter_init(void)
3077 {
3078         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
3079                         (void *)(long)smp_processor_id());
3080         register_cpu_notifier(&perf_cpu_nb);
3081
3082         return 0;
3083 }
3084 early_initcall(perf_counter_init);
3085
3086 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
3087 {
3088         return sprintf(buf, "%d\n", perf_reserved_percpu);
3089 }
3090
3091 static ssize_t
3092 perf_set_reserve_percpu(struct sysdev_class *class,
3093                         const char *buf,
3094                         size_t count)
3095 {
3096         struct perf_cpu_context *cpuctx;
3097         unsigned long val;
3098         int err, cpu, mpt;
3099
3100         err = strict_strtoul(buf, 10, &val);
3101         if (err)
3102                 return err;
3103         if (val > perf_max_counters)
3104                 return -EINVAL;
3105
3106         mutex_lock(&perf_resource_mutex);
3107         perf_reserved_percpu = val;
3108         for_each_online_cpu(cpu) {
3109                 cpuctx = &per_cpu(perf_cpu_context, cpu);
3110                 spin_lock_irq(&cpuctx->ctx.lock);
3111                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
3112                           perf_max_counters - perf_reserved_percpu);
3113                 cpuctx->max_pertask = mpt;
3114                 spin_unlock_irq(&cpuctx->ctx.lock);
3115         }
3116         mutex_unlock(&perf_resource_mutex);
3117
3118         return count;
3119 }
3120
3121 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
3122 {
3123         return sprintf(buf, "%d\n", perf_overcommit);
3124 }
3125
3126 static ssize_t
3127 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
3128 {
3129         unsigned long val;
3130         int err;
3131
3132         err = strict_strtoul(buf, 10, &val);
3133         if (err)
3134                 return err;
3135         if (val > 1)
3136                 return -EINVAL;
3137
3138         mutex_lock(&perf_resource_mutex);
3139         perf_overcommit = val;
3140         mutex_unlock(&perf_resource_mutex);
3141
3142         return count;
3143 }
3144
3145 static SYSDEV_CLASS_ATTR(
3146                                 reserve_percpu,
3147                                 0644,
3148                                 perf_show_reserve_percpu,
3149                                 perf_set_reserve_percpu
3150                         );
3151
3152 static SYSDEV_CLASS_ATTR(
3153                                 overcommit,
3154                                 0644,
3155                                 perf_show_overcommit,
3156                                 perf_set_overcommit
3157                         );
3158
3159 static struct attribute *perfclass_attrs[] = {
3160         &attr_reserve_percpu.attr,
3161         &attr_overcommit.attr,
3162         NULL
3163 };
3164
3165 static struct attribute_group perfclass_attr_group = {
3166         .attrs                  = perfclass_attrs,
3167         .name                   = "perf_counters",
3168 };
3169
3170 static int __init perf_counter_sysfs_init(void)
3171 {
3172         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
3173                                   &perfclass_attr_group);
3174 }
3175 device_initcall(perf_counter_sysfs_init);