kernel/perf_counter.c

   1 /*
   2  * Performance counter core code
   3  *
   4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
   6  *
   7  *
   8  *  For licensing details see kernel-base/COPYING
   9  */
  10
  11 #include <linux/fs.h>
  12 #include <linux/mm.h>
  13 #include <linux/cpu.h>
  14 #include <linux/smp.h>
  15 #include <linux/file.h>
  16 #include <linux/poll.h>
  17 #include <linux/sysfs.h>
  18 #include <linux/ptrace.h>
  19 #include <linux/percpu.h>
  20 #include <linux/vmstat.h>
  21 #include <linux/hardirq.h>
  22 #include <linux/rculist.h>
  23 #include <linux/uaccess.h>
  24 #include <linux/syscalls.h>
  25 #include <linux/anon_inodes.h>
  26 #include <linux/kernel_stat.h>
  27 #include <linux/perf_counter.h>
  28 #include <linux/dcache.h>
  29
  30 #include <asm/irq_regs.h>
  31
  32 /*
  33  * Each CPU has a list of per CPU counters:
  34  */
  35 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  36
  37 int perf_max_counters __read_mostly = 1;
  38 static int perf_reserved_percpu __read_mostly;
  39 static int perf_overcommit __read_mostly = 1;
  40
  41 /*
  42  * Mutex for (sysadmin-configurable) counter reservations:
  43  */
  44 static DEFINE_MUTEX(perf_resource_mutex);
  45
  46 /*
  47  * Architecture provided APIs - weak aliases:
  48  */
  49 extern __weak const struct hw_perf_counter_ops *
  50 hw_perf_counter_init(struct perf_counter *counter)
  51 {
  52         return NULL;
  53 }
  54
  55 u64 __weak hw_perf_save_disable(void)           { return 0; }
  56 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
  57 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
  58 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
  59                struct perf_cpu_context *cpuctx,
  60                struct perf_counter_context *ctx, int cpu)
  61 {
  62         return 0;
  63 }
  64
  65 void __weak perf_counter_print_debug(void)      { }
  66
  67 static void
  68 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  69 {
  70         struct perf_counter *group_leader = counter->group_leader;
  71
  72         /*
  73          * Depending on whether it is a standalone or sibling counter,
  74          * add it straight to the context's counter list, or to the group
  75          * leader's sibling list:
  76          */
  77         if (counter->group_leader == counter)
  78                 list_add_tail(&counter->list_entry, &ctx->counter_list);
  79         else {
  80                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
  81                 group_leader->nr_siblings++;
  82         }
  83
  84         list_add_rcu(&counter->event_entry, &ctx->event_list);
  85 }
  86
  87 static void
  88 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  89 {
  90         struct perf_counter *sibling, *tmp;
  91
  92         list_del_init(&counter->list_entry);
  93         list_del_rcu(&counter->event_entry);
  94
  95         if (counter->group_leader != counter)
  96                 counter->group_leader->nr_siblings--;
  97
  98         /*
  99          * If this was a group counter with sibling counters then
 100          * upgrade the siblings to singleton counters by adding them
 101          * to the context list directly:
 102          */
 103         list_for_each_entry_safe(sibling, tmp,
 104                                  &counter->sibling_list, list_entry) {
 105
 106                 list_move_tail(&sibling->list_entry, &ctx->counter_list);
 107                 sibling->group_leader = sibling;
 108         }
 109 }
 110
 111 static void
 112 counter_sched_out(struct perf_counter *counter,
 113                   struct perf_cpu_context *cpuctx,
 114                   struct perf_counter_context *ctx)
 115 {
 116         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 117                 return;
 118
 119         counter->state = PERF_COUNTER_STATE_INACTIVE;
 120         counter->tstamp_stopped = ctx->time;
 121         counter->hw_ops->disable(counter);
 122         counter->oncpu = -1;
 123
 124         if (!is_software_counter(counter))
 125                 cpuctx->active_oncpu--;
 126         ctx->nr_active--;
 127         if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
 128                 cpuctx->exclusive = 0;
 129 }
 130
 131 static void
 132 group_sched_out(struct perf_counter *group_counter,
 133                 struct perf_cpu_context *cpuctx,
 134                 struct perf_counter_context *ctx)
 135 {
 136         struct perf_counter *counter;
 137
 138         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
 139                 return;
 140
 141         counter_sched_out(group_counter, cpuctx, ctx);
 142
 143         /*
 144          * Schedule out siblings (if any):
 145          */
 146         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 147                 counter_sched_out(counter, cpuctx, ctx);
 148
 149         if (group_counter->hw_event.exclusive)
 150                 cpuctx->exclusive = 0;
 151 }
 152
 153 /*
 154  * Cross CPU call to remove a performance counter
 155  *
 156  * We disable the counter on the hardware level first. After that we
 157  * remove it from the context list.
 158  */
 159 static void __perf_counter_remove_from_context(void *info)
 160 {
 161         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 162         struct perf_counter *counter = info;
 163         struct perf_counter_context *ctx = counter->ctx;
 164         unsigned long flags;
 165         u64 perf_flags;
 166
 167         /*
 168          * If this is a task context, we need to check whether it is
 169          * the current task context of this cpu. If not it has been
 170          * scheduled out before the smp call arrived.
 171          */
 172         if (ctx->task && cpuctx->task_ctx != ctx)
 173                 return;
 174
 175         spin_lock_irqsave(&ctx->lock, flags);
 176
 177         counter_sched_out(counter, cpuctx, ctx);
 178
 179         counter->task = NULL;
 180         ctx->nr_counters--;
 181
 182         /*
 183          * Protect the list operation against NMI by disabling the
 184          * counters on a global level. NOP for non NMI based counters.
 185          */
 186         perf_flags = hw_perf_save_disable();
 187         list_del_counter(counter, ctx);
 188         hw_perf_restore(perf_flags);
 189
 190         if (!ctx->task) {
 191                 /*
 192                  * Allow more per task counters with respect to the
 193                  * reservation:
 194                  */
 195                 cpuctx->max_pertask =
 196                         min(perf_max_counters - ctx->nr_counters,
 197                             perf_max_counters - perf_reserved_percpu);
 198         }
 199
 200         spin_unlock_irqrestore(&ctx->lock, flags);
 201 }
 202
 203
 204 /*
 205  * Remove the counter from a task's (or a CPU's) list of counters.
 206  *
 207  * Must be called with counter->mutex and ctx->mutex held.
 208  *
 209  * CPU counters are removed with a smp call. For task counters we only
 210  * call when the task is on a CPU.
 211  */
 212 static void perf_counter_remove_from_context(struct perf_counter *counter)
 213 {
 214         struct perf_counter_context *ctx = counter->ctx;
 215         struct task_struct *task = ctx->task;
 216
 217         if (!task) {
 218                 /*
 219                  * Per cpu counters are removed via an smp call and
 220                  * the removal is always sucessful.
 221                  */
 222                 smp_call_function_single(counter->cpu,
 223                                          __perf_counter_remove_from_context,
 224                                          counter, 1);
 225                 return;
 226         }
 227
 228 retry:
 229         task_oncpu_function_call(task, __perf_counter_remove_from_context,
 230                                  counter);
 231
 232         spin_lock_irq(&ctx->lock);
 233         /*
 234          * If the context is active we need to retry the smp call.
 235          */
 236         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
 237                 spin_unlock_irq(&ctx->lock);
 238                 goto retry;
 239         }
 240
 241         /*
 242          * The lock prevents that this context is scheduled in so we
 243          * can remove the counter safely, if the call above did not
 244          * succeed.
 245          */
 246         if (!list_empty(&counter->list_entry)) {
 247                 ctx->nr_counters--;
 248                 list_del_counter(counter, ctx);
 249                 counter->task = NULL;
 250         }
 251         spin_unlock_irq(&ctx->lock);
 252 }
 253
 254 static inline u64 perf_clock(void)
 255 {
 256         return cpu_clock(smp_processor_id());
 257 }
 258
 259 /*
 260  * Update the record of the current time in a context.
 261  */
 262 static void update_context_time(struct perf_counter_context *ctx)
 263 {
 264         u64 now = perf_clock();
 265
 266         ctx->time += now - ctx->timestamp;
 267         ctx->timestamp = now;
 268 }
 269
 270 /*
 271  * Update the total_time_enabled and total_time_running fields for a counter.
 272  */
 273 static void update_counter_times(struct perf_counter *counter)
 274 {
 275         struct perf_counter_context *ctx = counter->ctx;
 276         u64 run_end;
 277
 278         if (counter->state < PERF_COUNTER_STATE_INACTIVE)
 279                 return;
 280
 281         counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
 282
 283         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
 284                 run_end = counter->tstamp_stopped;
 285         else
 286                 run_end = ctx->time;
 287
 288         counter->total_time_running = run_end - counter->tstamp_running;
 289 }
 290
 291 /*
 292  * Update total_time_enabled and total_time_running for all counters in a group.
 293  */
 294 static void update_group_times(struct perf_counter *leader)
 295 {
 296         struct perf_counter *counter;
 297
 298         update_counter_times(leader);
 299         list_for_each_entry(counter, &leader->sibling_list, list_entry)
 300                 update_counter_times(counter);
 301 }
 302
 303 /*
 304  * Cross CPU call to disable a performance counter
 305  */
 306 static void __perf_counter_disable(void *info)
 307 {
 308         struct perf_counter *counter = info;
 309         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 310         struct perf_counter_context *ctx = counter->ctx;
 311         unsigned long flags;
 312
 313         /*
 314          * If this is a per-task counter, need to check whether this
 315          * counter's task is the current task on this cpu.
 316          */
 317         if (ctx->task && cpuctx->task_ctx != ctx)
 318                 return;
 319
 320         spin_lock_irqsave(&ctx->lock, flags);
 321
 322         update_context_time(ctx);
 323
 324         /*
 325          * If the counter is on, turn it off.
 326          * If it is in error state, leave it in error state.
 327          */
 328         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
 329                 update_context_time(ctx);
 330                 update_counter_times(counter);
 331                 if (counter == counter->group_leader)
 332                         group_sched_out(counter, cpuctx, ctx);
 333                 else
 334                         counter_sched_out(counter, cpuctx, ctx);
 335                 counter->state = PERF_COUNTER_STATE_OFF;
 336         }
 337
 338         spin_unlock_irqrestore(&ctx->lock, flags);
 339 }
 340
 341 /*
 342  * Disable a counter.
 343  */
 344 static void perf_counter_disable(struct perf_counter *counter)
 345 {
 346         struct perf_counter_context *ctx = counter->ctx;
 347         struct task_struct *task = ctx->task;
 348
 349         if (!task) {
 350                 /*
 351                  * Disable the counter on the cpu that it's on
 352                  */
 353                 smp_call_function_single(counter->cpu, __perf_counter_disable,
 354                                          counter, 1);
 355                 return;
 356         }
 357
 358  retry:
 359         task_oncpu_function_call(task, __perf_counter_disable, counter);
 360
 361         spin_lock_irq(&ctx->lock);
 362         /*
 363          * If the counter is still active, we need to retry the cross-call.
 364          */
 365         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 366                 spin_unlock_irq(&ctx->lock);
 367                 goto retry;
 368         }
 369
 370         /*
 371          * Since we have the lock this context can't be scheduled
 372          * in, so we can change the state safely.
 373          */
 374         if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 375                 update_counter_times(counter);
 376                 counter->state = PERF_COUNTER_STATE_OFF;
 377         }
 378
 379         spin_unlock_irq(&ctx->lock);
 380 }
 381
 382 /*
 383  * Disable a counter and all its children.
 384  */
 385 static void perf_counter_disable_family(struct perf_counter *counter)
 386 {
 387         struct perf_counter *child;
 388
 389         perf_counter_disable(counter);
 390
 391         /*
 392          * Lock the mutex to protect the list of children
 393          */
 394         mutex_lock(&counter->mutex);
 395         list_for_each_entry(child, &counter->child_list, child_list)
 396                 perf_counter_disable(child);
 397         mutex_unlock(&counter->mutex);
 398 }
 399
 400 static int
 401 counter_sched_in(struct perf_counter *counter,
 402                  struct perf_cpu_context *cpuctx,
 403                  struct perf_counter_context *ctx,
 404                  int cpu)
 405 {
 406         if (counter->state <= PERF_COUNTER_STATE_OFF)
 407                 return 0;
 408
 409         counter->state = PERF_COUNTER_STATE_ACTIVE;
 410         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
 411         /*
 412          * The new state must be visible before we turn it on in the hardware:
 413          */
 414         smp_wmb();
 415
 416         if (counter->hw_ops->enable(counter)) {
 417                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 418                 counter->oncpu = -1;
 419                 return -EAGAIN;
 420         }
 421
 422         counter->tstamp_running += ctx->time - counter->tstamp_stopped;
 423
 424         if (!is_software_counter(counter))
 425                 cpuctx->active_oncpu++;
 426         ctx->nr_active++;
 427
 428         if (counter->hw_event.exclusive)
 429                 cpuctx->exclusive = 1;
 430
 431         return 0;
 432 }
 433
 434 /*
 435  * Return 1 for a group consisting entirely of software counters,
 436  * 0 if the group contains any hardware counters.
 437  */
 438 static int is_software_only_group(struct perf_counter *leader)
 439 {
 440         struct perf_counter *counter;
 441
 442         if (!is_software_counter(leader))
 443                 return 0;
 444
 445         list_for_each_entry(counter, &leader->sibling_list, list_entry)
 446                 if (!is_software_counter(counter))
 447                         return 0;
 448
 449         return 1;
 450 }
 451
 452 /*
 453  * Work out whether we can put this counter group on the CPU now.
 454  */
 455 static int group_can_go_on(struct perf_counter *counter,
 456                            struct perf_cpu_context *cpuctx,
 457                            int can_add_hw)
 458 {
 459         /*
 460          * Groups consisting entirely of software counters can always go on.
 461          */
 462         if (is_software_only_group(counter))
 463                 return 1;
 464         /*
 465          * If an exclusive group is already on, no other hardware
 466          * counters can go on.
 467          */
 468         if (cpuctx->exclusive)
 469                 return 0;
 470         /*
 471          * If this group is exclusive and there are already
 472          * counters on the CPU, it can't go on.
 473          */
 474         if (counter->hw_event.exclusive && cpuctx->active_oncpu)
 475                 return 0;
 476         /*
 477          * Otherwise, try to add it if all previous groups were able
 478          * to go on.
 479          */
 480         return can_add_hw;
 481 }
 482
 483 static void add_counter_to_ctx(struct perf_counter *counter,
 484                                struct perf_counter_context *ctx)
 485 {
 486         list_add_counter(counter, ctx);
 487         ctx->nr_counters++;
 488         counter->prev_state = PERF_COUNTER_STATE_OFF;
 489         counter->tstamp_enabled = ctx->time;
 490         counter->tstamp_running = ctx->time;
 491         counter->tstamp_stopped = ctx->time;
 492 }
 493
 494 /*
 495  * Cross CPU call to install and enable a performance counter
 496  */
 497 static void __perf_install_in_context(void *info)
 498 {
 499         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 500         struct perf_counter *counter = info;
 501         struct perf_counter_context *ctx = counter->ctx;
 502         struct perf_counter *leader = counter->group_leader;
 503         int cpu = smp_processor_id();
 504         unsigned long flags;
 505         u64 perf_flags;
 506         int err;
 507
 508         /*
 509          * If this is a task context, we need to check whether it is
 510          * the current task context of this cpu. If not it has been
 511          * scheduled out before the smp call arrived.
 512          */
 513         if (ctx->task && cpuctx->task_ctx != ctx)
 514                 return;
 515
 516         spin_lock_irqsave(&ctx->lock, flags);
 517         update_context_time(ctx);
 518
 519         /*
 520          * Protect the list operation against NMI by disabling the
 521          * counters on a global level. NOP for non NMI based counters.
 522          */
 523         perf_flags = hw_perf_save_disable();
 524
 525         add_counter_to_ctx(counter, ctx);
 526
 527         /*
 528          * Don't put the counter on if it is disabled or if
 529          * it is in a group and the group isn't on.
 530          */
 531         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
 532             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
 533                 goto unlock;
 534
 535         /*
 536          * An exclusive counter can't go on if there are already active
 537          * hardware counters, and no hardware counter can go on if there
 538          * is already an exclusive counter on.
 539          */
 540         if (!group_can_go_on(counter, cpuctx, 1))
 541                 err = -EEXIST;
 542         else
 543                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
 544
 545         if (err) {
 546                 /*
 547                  * This counter couldn't go on.  If it is in a group
 548                  * then we have to pull the whole group off.
 549                  * If the counter group is pinned then put it in error state.
 550                  */
 551                 if (leader != counter)
 552                         group_sched_out(leader, cpuctx, ctx);
 553                 if (leader->hw_event.pinned) {
 554                         update_group_times(leader);
 555                         leader->state = PERF_COUNTER_STATE_ERROR;
 556                 }
 557         }
 558
 559         if (!err && !ctx->task && cpuctx->max_pertask)
 560                 cpuctx->max_pertask--;
 561
 562  unlock:
 563         hw_perf_restore(perf_flags);
 564
 565         spin_unlock_irqrestore(&ctx->lock, flags);
 566 }
 567
 568 /*
 569  * Attach a performance counter to a context
 570  *
 571  * First we add the counter to the list with the hardware enable bit
 572  * in counter->hw_config cleared.
 573  *
 574  * If the counter is attached to a task which is on a CPU we use a smp
 575  * call to enable it in the task context. The task might have been
 576  * scheduled away, but we check this in the smp call again.
 577  *
 578  * Must be called with ctx->mutex held.
 579  */
 580 static void
 581 perf_install_in_context(struct perf_counter_context *ctx,
 582                         struct perf_counter *counter,
 583                         int cpu)
 584 {
 585         struct task_struct *task = ctx->task;
 586
 587         if (!task) {
 588                 /*
 589                  * Per cpu counters are installed via an smp call and
 590                  * the install is always sucessful.
 591                  */
 592                 smp_call_function_single(cpu, __perf_install_in_context,
 593                                          counter, 1);
 594                 return;
 595         }
 596
 597         counter->task = task;
 598 retry:
 599         task_oncpu_function_call(task, __perf_install_in_context,
 600                                  counter);
 601
 602         spin_lock_irq(&ctx->lock);
 603         /*
 604          * we need to retry the smp call.
 605          */
 606         if (ctx->is_active && list_empty(&counter->list_entry)) {
 607                 spin_unlock_irq(&ctx->lock);
 608                 goto retry;
 609         }
 610
 611         /*
 612          * The lock prevents that this context is scheduled in so we
 613          * can add the counter safely, if it the call above did not
 614          * succeed.
 615          */
 616         if (list_empty(&counter->list_entry))
 617                 add_counter_to_ctx(counter, ctx);
 618         spin_unlock_irq(&ctx->lock);
 619 }
 620
 621 /*
 622  * Cross CPU call to enable a performance counter
 623  */
 624 static void __perf_counter_enable(void *info)
 625 {
 626         struct perf_counter *counter = info;
 627         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 628         struct perf_counter_context *ctx = counter->ctx;
 629         struct perf_counter *leader = counter->group_leader;
 630         unsigned long flags;
 631         int err;
 632
 633         /*
 634          * If this is a per-task counter, need to check whether this
 635          * counter's task is the current task on this cpu.
 636          */
 637         if (ctx->task && cpuctx->task_ctx != ctx)
 638                 return;
 639
 640         spin_lock_irqsave(&ctx->lock, flags);
 641         update_context_time(ctx);
 642
 643         counter->prev_state = counter->state;
 644         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 645                 goto unlock;
 646         counter->state = PERF_COUNTER_STATE_INACTIVE;
 647         counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
 648
 649         /*
 650          * If the counter is in a group and isn't the group leader,
 651          * then don't put it on unless the group is on.
 652          */
 653         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
 654                 goto unlock;
 655
 656         if (!group_can_go_on(counter, cpuctx, 1))
 657                 err = -EEXIST;
 658         else
 659                 err = counter_sched_in(counter, cpuctx, ctx,
 660                                        smp_processor_id());
 661
 662         if (err) {
 663                 /*
 664                  * If this counter can't go on and it's part of a
 665                  * group, then the whole group has to come off.
 666                  */
 667                 if (leader != counter)
 668                         group_sched_out(leader, cpuctx, ctx);
 669                 if (leader->hw_event.pinned) {
 670                         update_group_times(leader);
 671                         leader->state = PERF_COUNTER_STATE_ERROR;
 672                 }
 673         }
 674
 675  unlock:
 676         spin_unlock_irqrestore(&ctx->lock, flags);
 677 }
 678
 679 /*
 680  * Enable a counter.
 681  */
 682 static void perf_counter_enable(struct perf_counter *counter)
 683 {
 684         struct perf_counter_context *ctx = counter->ctx;
 685         struct task_struct *task = ctx->task;
 686
 687         if (!task) {
 688                 /*
 689                  * Enable the counter on the cpu that it's on
 690                  */
 691                 smp_call_function_single(counter->cpu, __perf_counter_enable,
 692                                          counter, 1);
 693                 return;
 694         }
 695
 696         spin_lock_irq(&ctx->lock);
 697         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 698                 goto out;
 699
 700         /*
 701          * If the counter is in error state, clear that first.
 702          * That way, if we see the counter in error state below, we
 703          * know that it has gone back into error state, as distinct
 704          * from the task having been scheduled away before the
 705          * cross-call arrived.
 706          */
 707         if (counter->state == PERF_COUNTER_STATE_ERROR)
 708                 counter->state = PERF_COUNTER_STATE_OFF;
 709
 710  retry:
 711         spin_unlock_irq(&ctx->lock);
 712         task_oncpu_function_call(task, __perf_counter_enable, counter);
 713
 714         spin_lock_irq(&ctx->lock);
 715
 716         /*
 717          * If the context is active and the counter is still off,
 718          * we need to retry the cross-call.
 719          */
 720         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
 721                 goto retry;
 722
 723         /*
 724          * Since we have the lock this context can't be scheduled
 725          * in, so we can change the state safely.
 726          */
 727         if (counter->state == PERF_COUNTER_STATE_OFF) {
 728                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 729                 counter->tstamp_enabled =
 730                         ctx->time - counter->total_time_enabled;
 731         }
 732  out:
 733         spin_unlock_irq(&ctx->lock);
 734 }
 735
 736 static void perf_counter_refresh(struct perf_counter *counter, int refresh)
 737 {
 738         atomic_add(refresh, &counter->event_limit);
 739         perf_counter_enable(counter);
 740 }
 741
 742 /*
 743  * Enable a counter and all its children.
 744  */
 745 static void perf_counter_enable_family(struct perf_counter *counter)
 746 {
 747         struct perf_counter *child;
 748
 749         perf_counter_enable(counter);
 750
 751         /*
 752          * Lock the mutex to protect the list of children
 753          */
 754         mutex_lock(&counter->mutex);
 755         list_for_each_entry(child, &counter->child_list, child_list)
 756                 perf_counter_enable(child);
 757         mutex_unlock(&counter->mutex);
 758 }
 759
 760 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 761                               struct perf_cpu_context *cpuctx)
 762 {
 763         struct perf_counter *counter;
 764         u64 flags;
 765
 766         spin_lock(&ctx->lock);
 767         ctx->is_active = 0;
 768         if (likely(!ctx->nr_counters))
 769                 goto out;
 770         update_context_time(ctx);
 771
 772         flags = hw_perf_save_disable();
 773         if (ctx->nr_active) {
 774                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
 775                         group_sched_out(counter, cpuctx, ctx);
 776         }
 777         hw_perf_restore(flags);
 778  out:
 779         spin_unlock(&ctx->lock);
 780 }
 781
 782 /*
 783  * Called from scheduler to remove the counters of the current task,
 784  * with interrupts disabled.
 785  *
 786  * We stop each counter and update the counter value in counter->count.
 787  *
 788  * This does not protect us against NMI, but disable()
 789  * sets the disabled bit in the control field of counter _before_
 790  * accessing the counter control register. If a NMI hits, then it will
 791  * not restart the counter.
 792  */
 793 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 794 {
 795         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 796         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 797         struct pt_regs *regs;
 798
 799         if (likely(!cpuctx->task_ctx))
 800                 return;
 801
 802         update_context_time(ctx);
 803
 804         regs = task_pt_regs(task);
 805         perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
 806         __perf_counter_sched_out(ctx, cpuctx);
 807
 808         cpuctx->task_ctx = NULL;
 809 }
 810
 811 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 812 {
 813         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 814 }
 815
 816 static int
 817 group_sched_in(struct perf_counter *group_counter,
 818                struct perf_cpu_context *cpuctx,
 819                struct perf_counter_context *ctx,
 820                int cpu)
 821 {
 822         struct perf_counter *counter, *partial_group;
 823         int ret;
 824
 825         if (group_counter->state == PERF_COUNTER_STATE_OFF)
 826                 return 0;
 827
 828         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
 829         if (ret)
 830                 return ret < 0 ? ret : 0;
 831
 832         group_counter->prev_state = group_counter->state;
 833         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 834                 return -EAGAIN;
 835
 836         /*
 837          * Schedule in siblings as one group (if any):
 838          */
 839         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 840                 counter->prev_state = counter->state;
 841                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 842                         partial_group = counter;
 843                         goto group_error;
 844                 }
 845         }
 846
 847         return 0;
 848
 849 group_error:
 850         /*
 851          * Groups can be scheduled in as one unit only, so undo any
 852          * partial group before returning:
 853          */
 854         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 855                 if (counter == partial_group)
 856                         break;
 857                 counter_sched_out(counter, cpuctx, ctx);
 858         }
 859         counter_sched_out(group_counter, cpuctx, ctx);
 860
 861         return -EAGAIN;
 862 }
 863
 864 static void
 865 __perf_counter_sched_in(struct perf_counter_context *ctx,
 866                         struct perf_cpu_context *cpuctx, int cpu)
 867 {
 868         struct perf_counter *counter;
 869         u64 flags;
 870         int can_add_hw = 1;
 871
 872         spin_lock(&ctx->lock);
 873         ctx->is_active = 1;
 874         if (likely(!ctx->nr_counters))
 875                 goto out;
 876
 877         ctx->timestamp = perf_clock();
 878
 879         flags = hw_perf_save_disable();
 880
 881         /*
 882          * First go through the list and put on any pinned groups
 883          * in order to give them the best chance of going on.
 884          */
 885         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 886                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 887                     !counter->hw_event.pinned)
 888                         continue;
 889                 if (counter->cpu != -1 && counter->cpu != cpu)
 890                         continue;
 891
 892                 if (group_can_go_on(counter, cpuctx, 1))
 893                         group_sched_in(counter, cpuctx, ctx, cpu);
 894
 895                 /*
 896                  * If this pinned group hasn't been scheduled,
 897                  * put it in error state.
 898                  */
 899                 if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 900                         update_group_times(counter);
 901                         counter->state = PERF_COUNTER_STATE_ERROR;
 902                 }
 903         }
 904
 905         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 906                 /*
 907                  * Ignore counters in OFF or ERROR state, and
 908                  * ignore pinned counters since we did them already.
 909                  */
 910                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 911                     counter->hw_event.pinned)
 912                         continue;
 913
 914                 /*
 915                  * Listen to the 'cpu' scheduling filter constraint
 916                  * of counters:
 917                  */
 918                 if (counter->cpu != -1 && counter->cpu != cpu)
 919                         continue;
 920
 921                 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
 922                         if (group_sched_in(counter, cpuctx, ctx, cpu))
 923                                 can_add_hw = 0;
 924                 }
 925         }
 926         hw_perf_restore(flags);
 927  out:
 928         spin_unlock(&ctx->lock);
 929 }
 930
 931 /*
 932  * Called from scheduler to add the counters of the current task
 933  * with interrupts disabled.
 934  *
 935  * We restore the counter value and then enable it.
 936  *
 937  * This does not protect us against NMI, but enable()
 938  * sets the enabled bit in the control field of counter _before_
 939  * accessing the counter control register. If a NMI hits, then it will
 940  * keep the counter running.
 941  */
 942 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 943 {
 944         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 945         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 946
 947         __perf_counter_sched_in(ctx, cpuctx, cpu);
 948         cpuctx->task_ctx = ctx;
 949 }
 950
 951 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 952 {
 953         struct perf_counter_context *ctx = &cpuctx->ctx;
 954
 955         __perf_counter_sched_in(ctx, cpuctx, cpu);
 956 }
 957
 958 int perf_counter_task_disable(void)
 959 {
 960         struct task_struct *curr = current;
 961         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 962         struct perf_counter *counter;
 963         unsigned long flags;
 964         u64 perf_flags;
 965         int cpu;
 966
 967         if (likely(!ctx->nr_counters))
 968                 return 0;
 969
 970         local_irq_save(flags);
 971         cpu = smp_processor_id();
 972
 973         perf_counter_task_sched_out(curr, cpu);
 974
 975         spin_lock(&ctx->lock);
 976
 977         /*
 978          * Disable all the counters:
 979          */
 980         perf_flags = hw_perf_save_disable();
 981
 982         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 983                 if (counter->state != PERF_COUNTER_STATE_ERROR) {
 984                         update_group_times(counter);
 985                         counter->state = PERF_COUNTER_STATE_OFF;
 986                 }
 987         }
 988
 989         hw_perf_restore(perf_flags);
 990
 991         spin_unlock_irqrestore(&ctx->lock, flags);
 992
 993         return 0;
 994 }
 995
 996 int perf_counter_task_enable(void)
 997 {
 998         struct task_struct *curr = current;
 999         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1000         struct perf_counter *counter;
1001         unsigned long flags;
1002         u64 perf_flags;
1003         int cpu;
1004
1005         if (likely(!ctx->nr_counters))
1006                 return 0;
1007
1008         local_irq_save(flags);
1009         cpu = smp_processor_id();
1010
1011         perf_counter_task_sched_out(curr, cpu);
1012
1013         spin_lock(&ctx->lock);
1014
1015         /*
1016          * Disable all the counters:
1017          */
1018         perf_flags = hw_perf_save_disable();
1019
1020         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1021                 if (counter->state > PERF_COUNTER_STATE_OFF)
1022                         continue;
1023                 counter->state = PERF_COUNTER_STATE_INACTIVE;
1024                 counter->tstamp_enabled =
1025                         ctx->time - counter->total_time_enabled;
1026                 counter->hw_event.disabled = 0;
1027         }
1028         hw_perf_restore(perf_flags);
1029
1030         spin_unlock(&ctx->lock);
1031
1032         perf_counter_task_sched_in(curr, cpu);
1033
1034         local_irq_restore(flags);
1035
1036         return 0;
1037 }
1038
1039 /*
1040  * Round-robin a context's counters:
1041  */
1042 static void rotate_ctx(struct perf_counter_context *ctx)
1043 {
1044         struct perf_counter *counter;
1045         u64 perf_flags;
1046
1047         if (!ctx->nr_counters)
1048                 return;
1049
1050         spin_lock(&ctx->lock);
1051         /*
1052          * Rotate the first entry last (works just fine for group counters too):
1053          */
1054         perf_flags = hw_perf_save_disable();
1055         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
1056                 list_move_tail(&counter->list_entry, &ctx->counter_list);
1057                 break;
1058         }
1059         hw_perf_restore(perf_flags);
1060
1061         spin_unlock(&ctx->lock);
1062 }
1063
1064 void perf_counter_task_tick(struct task_struct *curr, int cpu)
1065 {
1066         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
1067         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
1068         const int rotate_percpu = 0;
1069
1070         if (rotate_percpu)
1071                 perf_counter_cpu_sched_out(cpuctx);
1072         perf_counter_task_sched_out(curr, cpu);
1073
1074         if (rotate_percpu)
1075                 rotate_ctx(&cpuctx->ctx);
1076         rotate_ctx(ctx);
1077
1078         if (rotate_percpu)
1079                 perf_counter_cpu_sched_in(cpuctx, cpu);
1080         perf_counter_task_sched_in(curr, cpu);
1081 }
1082
1083 /*
1084  * Cross CPU call to read the hardware counter
1085  */
1086 static void __read(void *info)
1087 {
1088         struct perf_counter *counter = info;
1089         struct perf_counter_context *ctx = counter->ctx;
1090         unsigned long flags;
1091
1092         local_irq_save(flags);
1093         if (ctx->is_active)
1094                 update_context_time(ctx);
1095         counter->hw_ops->read(counter);
1096         update_counter_times(counter);
1097         local_irq_restore(flags);
1098 }
1099
1100 static u64 perf_counter_read(struct perf_counter *counter)
1101 {
1102         /*
1103          * If counter is enabled and currently active on a CPU, update the
1104          * value in the counter structure:
1105          */
1106         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1107                 smp_call_function_single(counter->oncpu,
1108                                          __read, counter, 1);
1109         } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
1110                 update_counter_times(counter);
1111         }
1112
1113         return atomic64_read(&counter->count);
1114 }
1115
1116 static void put_context(struct perf_counter_context *ctx)
1117 {
1118         if (ctx->task)
1119                 put_task_struct(ctx->task);
1120 }
1121
1122 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1123 {
1124         struct perf_cpu_context *cpuctx;
1125         struct perf_counter_context *ctx;
1126         struct task_struct *task;
1127
1128         /*
1129          * If cpu is not a wildcard then this is a percpu counter:
1130          */
1131         if (cpu != -1) {
1132                 /* Must be root to operate on a CPU counter: */
1133                 if (!capable(CAP_SYS_ADMIN))
1134                         return ERR_PTR(-EACCES);
1135
1136                 if (cpu < 0 || cpu > num_possible_cpus())
1137                         return ERR_PTR(-EINVAL);
1138
1139                 /*
1140                  * We could be clever and allow to attach a counter to an
1141                  * offline CPU and activate it when the CPU comes up, but
1142                  * that's for later.
1143                  */
1144                 if (!cpu_isset(cpu, cpu_online_map))
1145                         return ERR_PTR(-ENODEV);
1146
1147                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1148                 ctx = &cpuctx->ctx;
1149
1150                 return ctx;
1151         }
1152
1153         rcu_read_lock();
1154         if (!pid)
1155                 task = current;
1156         else
1157                 task = find_task_by_vpid(pid);
1158         if (task)
1159                 get_task_struct(task);
1160         rcu_read_unlock();
1161
1162         if (!task)
1163                 return ERR_PTR(-ESRCH);
1164
1165         ctx = &task->perf_counter_ctx;
1166         ctx->task = task;
1167
1168         /* Reuse ptrace permission checks for now. */
1169         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1170                 put_context(ctx);
1171                 return ERR_PTR(-EACCES);
1172         }
1173
1174         return ctx;
1175 }
1176
1177 static void free_counter_rcu(struct rcu_head *head)
1178 {
1179         struct perf_counter *counter;
1180
1181         counter = container_of(head, struct perf_counter, rcu_head);
1182         kfree(counter);
1183 }
1184
1185 static void perf_pending_sync(struct perf_counter *counter);
1186
1187 static void free_counter(struct perf_counter *counter)
1188 {
1189         perf_pending_sync(counter);
1190
1191         if (counter->destroy)
1192                 counter->destroy(counter);
1193
1194         call_rcu(&counter->rcu_head, free_counter_rcu);
1195 }
1196
1197 /*
1198  * Called when the last reference to the file is gone.
1199  */
1200 static int perf_release(struct inode *inode, struct file *file)
1201 {
1202         struct perf_counter *counter = file->private_data;
1203         struct perf_counter_context *ctx = counter->ctx;
1204
1205         file->private_data = NULL;
1206
1207         mutex_lock(&ctx->mutex);
1208         mutex_lock(&counter->mutex);
1209
1210         perf_counter_remove_from_context(counter);
1211
1212         mutex_unlock(&counter->mutex);
1213         mutex_unlock(&ctx->mutex);
1214
1215         free_counter(counter);
1216         put_context(ctx);
1217
1218         return 0;
1219 }
1220
1221 /*
1222  * Read the performance counter - simple non blocking version for now
1223  */
1224 static ssize_t
1225 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1226 {
1227         u64 values[3];
1228         int n;
1229
1230         /*
1231          * Return end-of-file for a read on a counter that is in
1232          * error state (i.e. because it was pinned but it couldn't be
1233          * scheduled on to the CPU at some point).
1234          */
1235         if (counter->state == PERF_COUNTER_STATE_ERROR)
1236                 return 0;
1237
1238         mutex_lock(&counter->mutex);
1239         values[0] = perf_counter_read(counter);
1240         n = 1;
1241         if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1242                 values[n++] = counter->total_time_enabled +
1243                         atomic64_read(&counter->child_total_time_enabled);
1244         if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1245                 values[n++] = counter->total_time_running +
1246                         atomic64_read(&counter->child_total_time_running);
1247         mutex_unlock(&counter->mutex);
1248
1249         if (count < n * sizeof(u64))
1250                 return -EINVAL;
1251         count = n * sizeof(u64);
1252
1253         if (copy_to_user(buf, values, count))
1254                 return -EFAULT;
1255
1256         return count;
1257 }
1258
1259 static ssize_t
1260 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1261 {
1262         struct perf_counter *counter = file->private_data;
1263
1264         return perf_read_hw(counter, buf, count);
1265 }
1266
1267 static unsigned int perf_poll(struct file *file, poll_table *wait)
1268 {
1269         struct perf_counter *counter = file->private_data;
1270         struct perf_mmap_data *data;
1271         unsigned int events;
1272
1273         rcu_read_lock();
1274         data = rcu_dereference(counter->data);
1275         if (data)
1276                 events = atomic_xchg(&data->wakeup, 0);
1277         else
1278                 events = POLL_HUP;
1279         rcu_read_unlock();
1280
1281         poll_wait(file, &counter->waitq, wait);
1282
1283         return events;
1284 }
1285
1286 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1287 {
1288         struct perf_counter *counter = file->private_data;
1289         int err = 0;
1290
1291         switch (cmd) {
1292         case PERF_COUNTER_IOC_ENABLE:
1293                 perf_counter_enable_family(counter);
1294                 break;
1295         case PERF_COUNTER_IOC_DISABLE:
1296                 perf_counter_disable_family(counter);
1297                 break;
1298         case PERF_COUNTER_IOC_REFRESH:
1299                 perf_counter_refresh(counter, arg);
1300                 break;
1301         default:
1302                 err = -ENOTTY;
1303         }
1304         return err;
1305 }
1306
1307 /*
1308  * Callers need to ensure there can be no nesting of this function, otherwise
1309  * the seqlock logic goes bad. We can not serialize this because the arch
1310  * code calls this from NMI context.
1311  */
1312 void perf_counter_update_userpage(struct perf_counter *counter)
1313 {
1314         struct perf_mmap_data *data;
1315         struct perf_counter_mmap_page *userpg;
1316
1317         rcu_read_lock();
1318         data = rcu_dereference(counter->data);
1319         if (!data)
1320                 goto unlock;
1321
1322         userpg = data->user_page;
1323
1324         /*
1325          * Disable preemption so as to not let the corresponding user-space
1326          * spin too long if we get preempted.
1327          */
1328         preempt_disable();
1329         ++userpg->lock;
1330         barrier();
1331         userpg->index = counter->hw.idx;
1332         userpg->offset = atomic64_read(&counter->count);
1333         if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1334                 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1335
1336         barrier();
1337         ++userpg->lock;
1338         preempt_enable();
1339 unlock:
1340         rcu_read_unlock();
1341 }
1342
1343 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1344 {
1345         struct perf_counter *counter = vma->vm_file->private_data;
1346         struct perf_mmap_data *data;
1347         int ret = VM_FAULT_SIGBUS;
1348
1349         rcu_read_lock();
1350         data = rcu_dereference(counter->data);
1351         if (!data)
1352                 goto unlock;
1353
1354         if (vmf->pgoff == 0) {
1355                 vmf->page = virt_to_page(data->user_page);
1356         } else {
1357                 int nr = vmf->pgoff - 1;
1358
1359                 if ((unsigned)nr > data->nr_pages)
1360                         goto unlock;
1361
1362                 vmf->page = virt_to_page(data->data_pages[nr]);
1363         }
1364         get_page(vmf->page);
1365         ret = 0;
1366 unlock:
1367         rcu_read_unlock();
1368
1369         return ret;
1370 }
1371
1372 static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
1373 {
1374         struct perf_mmap_data *data;
1375         unsigned long size;
1376         int i;
1377
1378         WARN_ON(atomic_read(&counter->mmap_count));
1379
1380         size = sizeof(struct perf_mmap_data);
1381         size += nr_pages * sizeof(void *);
1382
1383         data = kzalloc(size, GFP_KERNEL);
1384         if (!data)
1385                 goto fail;
1386
1387         data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
1388         if (!data->user_page)
1389                 goto fail_user_page;
1390
1391         for (i = 0; i < nr_pages; i++) {
1392                 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
1393                 if (!data->data_pages[i])
1394                         goto fail_data_pages;
1395         }
1396
1397         data->nr_pages = nr_pages;
1398
1399         rcu_assign_pointer(counter->data, data);
1400
1401         return 0;
1402
1403 fail_data_pages:
1404         for (i--; i >= 0; i--)
1405                 free_page((unsigned long)data->data_pages[i]);
1406
1407         free_page((unsigned long)data->user_page);
1408
1409 fail_user_page:
1410         kfree(data);
1411
1412 fail:
1413         return -ENOMEM;
1414 }
1415
1416 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
1417 {
1418         struct perf_mmap_data *data = container_of(rcu_head,
1419                         struct perf_mmap_data, rcu_head);
1420         int i;
1421
1422         free_page((unsigned long)data->user_page);
1423         for (i = 0; i < data->nr_pages; i++)
1424                 free_page((unsigned long)data->data_pages[i]);
1425         kfree(data);
1426 }
1427
1428 static void perf_mmap_data_free(struct perf_counter *counter)
1429 {
1430         struct perf_mmap_data *data = counter->data;
1431
1432         WARN_ON(atomic_read(&counter->mmap_count));
1433
1434         rcu_assign_pointer(counter->data, NULL);
1435         call_rcu(&data->rcu_head, __perf_mmap_data_free);
1436 }
1437
1438 static void perf_mmap_open(struct vm_area_struct *vma)
1439 {
1440         struct perf_counter *counter = vma->vm_file->private_data;
1441
1442         atomic_inc(&counter->mmap_count);
1443 }
1444
1445 static void perf_mmap_close(struct vm_area_struct *vma)
1446 {
1447         struct perf_counter *counter = vma->vm_file->private_data;
1448
1449         if (atomic_dec_and_mutex_lock(&counter->mmap_count,
1450                                       &counter->mmap_mutex)) {
1451                 vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
1452                 perf_mmap_data_free(counter);
1453                 mutex_unlock(&counter->mmap_mutex);
1454         }
1455 }
1456
1457 static struct vm_operations_struct perf_mmap_vmops = {
1458         .open  = perf_mmap_open,
1459         .close = perf_mmap_close,
1460         .fault = perf_mmap_fault,
1461 };
1462
1463 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1464 {
1465         struct perf_counter *counter = file->private_data;
1466         unsigned long vma_size;
1467         unsigned long nr_pages;
1468         unsigned long locked, lock_limit;
1469         int ret = 0;
1470
1471         if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1472                 return -EINVAL;
1473
1474         vma_size = vma->vm_end - vma->vm_start;
1475         nr_pages = (vma_size / PAGE_SIZE) - 1;
1476
1477         /*
1478          * If we have data pages ensure they're a power-of-two number, so we
1479          * can do bitmasks instead of modulo.
1480          */
1481         if (nr_pages != 0 && !is_power_of_2(nr_pages))
1482                 return -EINVAL;
1483
1484         if (vma_size != PAGE_SIZE * (1 + nr_pages))
1485                 return -EINVAL;
1486
1487         if (vma->vm_pgoff != 0)
1488                 return -EINVAL;
1489
1490         mutex_lock(&counter->mmap_mutex);
1491         if (atomic_inc_not_zero(&counter->mmap_count)) {
1492                 if (nr_pages != counter->data->nr_pages)
1493                         ret = -EINVAL;
1494                 goto unlock;
1495         }
1496
1497         locked = vma->vm_mm->locked_vm;
1498         locked += nr_pages + 1;
1499
1500         lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
1501         lock_limit >>= PAGE_SHIFT;
1502
1503         if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
1504                 ret = -EPERM;
1505                 goto unlock;
1506         }
1507
1508         WARN_ON(counter->data);
1509         ret = perf_mmap_data_alloc(counter, nr_pages);
1510         if (ret)
1511                 goto unlock;
1512
1513         atomic_set(&counter->mmap_count, 1);
1514         vma->vm_mm->locked_vm += nr_pages + 1;
1515 unlock:
1516         mutex_unlock(&counter->mmap_mutex);
1517
1518         vma->vm_flags &= ~VM_MAYWRITE;
1519         vma->vm_flags |= VM_RESERVED;
1520         vma->vm_ops = &perf_mmap_vmops;
1521
1522         return ret;
1523 }
1524
1525 static int perf_fasync(int fd, struct file *filp, int on)
1526 {
1527         struct perf_counter *counter = filp->private_data;
1528         struct inode *inode = filp->f_path.dentry->d_inode;
1529         int retval;
1530
1531         mutex_lock(&inode->i_mutex);
1532         retval = fasync_helper(fd, filp, on, &counter->fasync);
1533         mutex_unlock(&inode->i_mutex);
1534
1535         if (retval < 0)
1536                 return retval;
1537
1538         return 0;
1539 }
1540
1541 static const struct file_operations perf_fops = {
1542         .release                = perf_release,
1543         .read                   = perf_read,
1544         .poll                   = perf_poll,
1545         .unlocked_ioctl         = perf_ioctl,
1546         .compat_ioctl           = perf_ioctl,
1547         .mmap                   = perf_mmap,
1548         .fasync                 = perf_fasync,
1549 };
1550
1551 /*
1552  * Perf counter wakeup
1553  *
1554  * If there's data, ensure we set the poll() state and publish everything
1555  * to user-space before waking everybody up.
1556  */
1557
1558 void perf_counter_wakeup(struct perf_counter *counter)
1559 {
1560         struct perf_mmap_data *data;
1561
1562         rcu_read_lock();
1563         data = rcu_dereference(counter->data);
1564         if (data) {
1565                 atomic_set(&data->wakeup, POLL_IN);
1566                 /*
1567                  * Ensure all data writes are issued before updating the
1568                  * user-space data head information. The matching rmb()
1569                  * will be in userspace after reading this value.
1570                  */
1571                 smp_wmb();
1572                 data->user_page->data_head = atomic_read(&data->head);
1573         }
1574         rcu_read_unlock();
1575
1576         wake_up_all(&counter->waitq);
1577
1578         if (counter->pending_kill) {
1579                 kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
1580                 counter->pending_kill = 0;
1581         }
1582 }
1583
1584 /*
1585  * Pending wakeups
1586  *
1587  * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
1588  *
1589  * The NMI bit means we cannot possibly take locks. Therefore, maintain a
1590  * single linked list and use cmpxchg() to add entries lockless.
1591  */
1592
1593 static void perf_pending_counter(struct perf_pending_entry *entry)
1594 {
1595         struct perf_counter *counter = container_of(entry,
1596                         struct perf_counter, pending);
1597
1598         if (counter->pending_disable) {
1599                 counter->pending_disable = 0;
1600                 perf_counter_disable(counter);
1601         }
1602
1603         if (counter->pending_wakeup) {
1604                 counter->pending_wakeup = 0;
1605                 perf_counter_wakeup(counter);
1606         }
1607 }
1608
1609 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
1610
1611 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
1612         PENDING_TAIL,
1613 };
1614
1615 static void perf_pending_queue(struct perf_pending_entry *entry,
1616                                void (*func)(struct perf_pending_entry *))
1617 {
1618         struct perf_pending_entry **head;
1619
1620         if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
1621                 return;
1622
1623         entry->func = func;
1624
1625         head = &get_cpu_var(perf_pending_head);
1626
1627         do {
1628                 entry->next = *head;
1629         } while (cmpxchg(head, entry->next, entry) != entry->next);
1630
1631         set_perf_counter_pending();
1632
1633         put_cpu_var(perf_pending_head);
1634 }
1635
1636 static int __perf_pending_run(void)
1637 {
1638         struct perf_pending_entry *list;
1639         int nr = 0;
1640
1641         list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
1642         while (list != PENDING_TAIL) {
1643                 void (*func)(struct perf_pending_entry *);
1644                 struct perf_pending_entry *entry = list;
1645
1646                 list = list->next;
1647
1648                 func = entry->func;
1649                 entry->next = NULL;
1650                 /*
1651                  * Ensure we observe the unqueue before we issue the wakeup,
1652                  * so that we won't be waiting forever.
1653                  * -- see perf_not_pending().
1654                  */
1655                 smp_wmb();
1656
1657                 func(entry);
1658                 nr++;
1659         }
1660
1661         return nr;
1662 }
1663
1664 static inline int perf_not_pending(struct perf_counter *counter)
1665 {
1666         /*
1667          * If we flush on whatever cpu we run, there is a chance we don't
1668          * need to wait.
1669          */
1670         get_cpu();
1671         __perf_pending_run();
1672         put_cpu();
1673
1674         /*
1675          * Ensure we see the proper queue state before going to sleep
1676          * so that we do not miss the wakeup. -- see perf_pending_handle()
1677          */
1678         smp_rmb();
1679         return counter->pending.next == NULL;
1680 }
1681
1682 static void perf_pending_sync(struct perf_counter *counter)
1683 {
1684         wait_event(counter->waitq, perf_not_pending(counter));
1685 }
1686
1687 void perf_counter_do_pending(void)
1688 {
1689         __perf_pending_run();
1690 }
1691
1692 /*
1693  * Callchain support -- arch specific
1694  */
1695
1696 __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1697 {
1698         return NULL;
1699 }
1700
1701 /*
1702  * Output
1703  */
1704
1705 struct perf_output_handle {
1706         struct perf_counter     *counter;
1707         struct perf_mmap_data   *data;
1708         unsigned int            offset;
1709         unsigned int            head;
1710         int                     wakeup;
1711         int                     nmi;
1712         int                     overflow;
1713 };
1714
1715 static inline void __perf_output_wakeup(struct perf_output_handle *handle)
1716 {
1717         if (handle->nmi) {
1718                 handle->counter->pending_wakeup = 1;
1719                 perf_pending_queue(&handle->counter->pending,
1720                                    perf_pending_counter);
1721         } else
1722                 perf_counter_wakeup(handle->counter);
1723 }
1724
1725 static int perf_output_begin(struct perf_output_handle *handle,
1726                              struct perf_counter *counter, unsigned int size,
1727                              int nmi, int overflow)
1728 {
1729         struct perf_mmap_data *data;
1730         unsigned int offset, head;
1731
1732         rcu_read_lock();
1733         data = rcu_dereference(counter->data);
1734         if (!data)
1735                 goto out;
1736
1737         handle->counter  = counter;
1738         handle->nmi      = nmi;
1739         handle->overflow = overflow;
1740
1741         if (!data->nr_pages)
1742                 goto fail;
1743
1744         do {
1745                 offset = head = atomic_read(&data->head);
1746                 head += size;
1747         } while (atomic_cmpxchg(&data->head, offset, head) != offset);
1748
1749         handle->data    = data;
1750         handle->offset  = offset;
1751         handle->head    = head;
1752         handle->wakeup  = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
1753
1754         return 0;
1755
1756 fail:
1757         __perf_output_wakeup(handle);
1758 out:
1759         rcu_read_unlock();
1760
1761         return -ENOSPC;
1762 }
1763
1764 static void perf_output_copy(struct perf_output_handle *handle,
1765                              void *buf, unsigned int len)
1766 {
1767         unsigned int pages_mask;
1768         unsigned int offset;
1769         unsigned int size;
1770         void **pages;
1771
1772         offset          = handle->offset;
1773         pages_mask      = handle->data->nr_pages - 1;
1774         pages           = handle->data->data_pages;
1775
1776         do {
1777                 unsigned int page_offset;
1778                 int nr;
1779
1780                 nr          = (offset >> PAGE_SHIFT) & pages_mask;
1781                 page_offset = offset & (PAGE_SIZE - 1);
1782                 size        = min_t(unsigned int, PAGE_SIZE - page_offset, len);
1783
1784                 memcpy(pages[nr] + page_offset, buf, size);
1785
1786                 len         -= size;
1787                 buf         += size;
1788                 offset      += size;
1789         } while (len);
1790
1791         handle->offset = offset;
1792
1793         WARN_ON_ONCE(handle->offset > handle->head);
1794 }
1795
1796 #define perf_output_put(handle, x) \
1797         perf_output_copy((handle), &(x), sizeof(x))
1798
1799 static void perf_output_end(struct perf_output_handle *handle)
1800 {
1801         int wakeup_events = handle->counter->hw_event.wakeup_events;
1802
1803         if (handle->overflow && wakeup_events) {
1804                 int events = atomic_inc_return(&handle->data->events);
1805                 if (events >= wakeup_events) {
1806                         atomic_sub(wakeup_events, &handle->data->events);
1807                         __perf_output_wakeup(handle);
1808                 }
1809         } else if (handle->wakeup)
1810                 __perf_output_wakeup(handle);
1811         rcu_read_unlock();
1812 }
1813
1814 static void perf_counter_output(struct perf_counter *counter,
1815                                 int nmi, struct pt_regs *regs)
1816 {
1817         int ret;
1818         u64 record_type = counter->hw_event.record_type;
1819         struct perf_output_handle handle;
1820         struct perf_event_header header;
1821         u64 ip;
1822         struct {
1823                 u32 pid, tid;
1824         } tid_entry;
1825         struct {
1826                 u64 event;
1827                 u64 counter;
1828         } group_entry;
1829         struct perf_callchain_entry *callchain = NULL;
1830         int callchain_size = 0;
1831         u64 time;
1832
1833         header.type = PERF_EVENT_COUNTER_OVERFLOW;
1834         header.size = sizeof(header);
1835
1836         if (record_type & PERF_RECORD_IP) {
1837                 ip = instruction_pointer(regs);
1838                 header.type |= __PERF_EVENT_IP;
1839                 header.size += sizeof(ip);
1840         }
1841
1842         if (record_type & PERF_RECORD_TID) {
1843                 /* namespace issues */
1844                 tid_entry.pid = current->group_leader->pid;
1845                 tid_entry.tid = current->pid;
1846
1847                 header.type |= __PERF_EVENT_TID;
1848                 header.size += sizeof(tid_entry);
1849         }
1850
1851         if (record_type & PERF_RECORD_GROUP) {
1852                 header.type |= __PERF_EVENT_GROUP;
1853                 header.size += sizeof(u64) +
1854                         counter->nr_siblings * sizeof(group_entry);
1855         }
1856
1857         if (record_type & PERF_RECORD_CALLCHAIN) {
1858                 callchain = perf_callchain(regs);
1859
1860                 if (callchain) {
1861                         callchain_size = (1 + callchain->nr) * sizeof(u64);
1862
1863                         header.type |= __PERF_EVENT_CALLCHAIN;
1864                         header.size += callchain_size;
1865                 }
1866         }
1867
1868         if (record_type & PERF_RECORD_TIME) {
1869                 /*
1870                  * Maybe do better on x86 and provide cpu_clock_nmi()
1871                  */
1872                 time = sched_clock();
1873
1874                 header.type |= __PERF_EVENT_TIME;
1875                 header.size += sizeof(u64);
1876         }
1877
1878         ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
1879         if (ret)
1880                 return;
1881
1882         perf_output_put(&handle, header);
1883
1884         if (record_type & PERF_RECORD_IP)
1885                 perf_output_put(&handle, ip);
1886
1887         if (record_type & PERF_RECORD_TID)
1888                 perf_output_put(&handle, tid_entry);
1889
1890         if (record_type & PERF_RECORD_GROUP) {
1891                 struct perf_counter *leader, *sub;
1892                 u64 nr = counter->nr_siblings;
1893
1894                 perf_output_put(&handle, nr);
1895
1896                 leader = counter->group_leader;
1897                 list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1898                         if (sub != counter)
1899                                 sub->hw_ops->read(sub);
1900
1901                         group_entry.event = sub->hw_event.config;
1902                         group_entry.counter = atomic64_read(&sub->count);
1903
1904                         perf_output_put(&handle, group_entry);
1905                 }
1906         }
1907
1908         if (callchain)
1909                 perf_output_copy(&handle, callchain, callchain_size);
1910
1911         if (record_type & PERF_RECORD_TIME)
1912                 perf_output_put(&handle, time);
1913
1914         perf_output_end(&handle);
1915 }
1916
1917 /*
1918  * mmap tracking
1919  */
1920
1921 struct perf_mmap_event {
1922         struct file     *file;
1923         char            *file_name;
1924         int             file_size;
1925
1926         struct {
1927                 struct perf_event_header        header;
1928
1929                 u32                             pid;
1930                 u32                             tid;
1931                 u64                             start;
1932                 u64                             len;
1933                 u64                             pgoff;
1934         } event;
1935 };
1936
1937 static void perf_counter_mmap_output(struct perf_counter *counter,
1938                                      struct perf_mmap_event *mmap_event)
1939 {
1940         struct perf_output_handle handle;
1941         int size = mmap_event->event.header.size;
1942         int ret = perf_output_begin(&handle, counter, size, 0, 0);
1943
1944         if (ret)
1945                 return;
1946
1947         perf_output_put(&handle, mmap_event->event);
1948         perf_output_copy(&handle, mmap_event->file_name,
1949                                    mmap_event->file_size);
1950         perf_output_end(&handle);
1951 }
1952
1953 static int perf_counter_mmap_match(struct perf_counter *counter,
1954                                    struct perf_mmap_event *mmap_event)
1955 {
1956         if (counter->hw_event.mmap &&
1957             mmap_event->event.header.type == PERF_EVENT_MMAP)
1958                 return 1;
1959
1960         if (counter->hw_event.munmap &&
1961             mmap_event->event.header.type == PERF_EVENT_MUNMAP)
1962                 return 1;
1963
1964         return 0;
1965 }
1966
1967 static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
1968                                   struct perf_mmap_event *mmap_event)
1969 {
1970         struct perf_counter *counter;
1971
1972         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
1973                 return;
1974
1975         rcu_read_lock();
1976         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
1977                 if (perf_counter_mmap_match(counter, mmap_event))
1978                         perf_counter_mmap_output(counter, mmap_event);
1979         }
1980         rcu_read_unlock();
1981 }
1982
1983 static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
1984 {
1985         struct perf_cpu_context *cpuctx;
1986         struct file *file = mmap_event->file;
1987         unsigned int size;
1988         char tmp[16];
1989         char *buf = NULL;
1990         char *name;
1991
1992         if (file) {
1993                 buf = kzalloc(PATH_MAX, GFP_KERNEL);
1994                 if (!buf) {
1995                         name = strncpy(tmp, "//enomem", sizeof(tmp));
1996                         goto got_name;
1997                 }
1998                 name = dentry_path(file->f_dentry, buf, PATH_MAX);
1999                 if (IS_ERR(name)) {
2000                         name = strncpy(tmp, "//toolong", sizeof(tmp));
2001                         goto got_name;
2002                 }
2003         } else {
2004                 name = strncpy(tmp, "//anon", sizeof(tmp));
2005                 goto got_name;
2006         }
2007
2008 got_name:
2009         size = ALIGN(strlen(name), sizeof(u64));
2010
2011         mmap_event->file_name = name;
2012         mmap_event->file_size = size;
2013
2014         mmap_event->event.header.size = sizeof(mmap_event->event) + size;
2015
2016         cpuctx = &get_cpu_var(perf_cpu_context);
2017         perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
2018         put_cpu_var(perf_cpu_context);
2019
2020         perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
2021
2022         kfree(buf);
2023 }
2024
2025 void perf_counter_mmap(unsigned long addr, unsigned long len,
2026                        unsigned long pgoff, struct file *file)
2027 {
2028         struct perf_mmap_event mmap_event = {
2029                 .file   = file,
2030                 .event  = {
2031                         .header = { .type = PERF_EVENT_MMAP, },
2032                         .pid    = current->group_leader->pid,
2033                         .tid    = current->pid,
2034                         .start  = addr,
2035                         .len    = len,
2036                         .pgoff  = pgoff,
2037                 },
2038         };
2039
2040         perf_counter_mmap_event(&mmap_event);
2041 }
2042
2043 void perf_counter_munmap(unsigned long addr, unsigned long len,
2044                          unsigned long pgoff, struct file *file)
2045 {
2046         struct perf_mmap_event mmap_event = {
2047                 .file   = file,
2048                 .event  = {
2049                         .header = { .type = PERF_EVENT_MUNMAP, },
2050                         .pid    = current->group_leader->pid,
2051                         .tid    = current->pid,
2052                         .start  = addr,
2053                         .len    = len,
2054                         .pgoff  = pgoff,
2055                 },
2056         };
2057
2058         perf_counter_mmap_event(&mmap_event);
2059 }
2060
2061 /*
2062  * Generic counter overflow handling.
2063  */
2064
2065 int perf_counter_overflow(struct perf_counter *counter,
2066                           int nmi, struct pt_regs *regs)
2067 {
2068         int events = atomic_read(&counter->event_limit);
2069         int ret = 0;
2070
2071         counter->pending_kill = POLL_IN;
2072         if (events && atomic_dec_and_test(&counter->event_limit)) {
2073                 ret = 1;
2074                 counter->pending_kill = POLL_HUP;
2075                 if (nmi) {
2076                         counter->pending_disable = 1;
2077                         perf_pending_queue(&counter->pending,
2078                                            perf_pending_counter);
2079                 } else
2080                         perf_counter_disable(counter);
2081         }
2082
2083         perf_counter_output(counter, nmi, regs);
2084         return ret;
2085 }
2086
2087 /*
2088  * Generic software counter infrastructure
2089  */
2090
2091 static void perf_swcounter_update(struct perf_counter *counter)
2092 {
2093         struct hw_perf_counter *hwc = &counter->hw;
2094         u64 prev, now;
2095         s64 delta;
2096
2097 again:
2098         prev = atomic64_read(&hwc->prev_count);
2099         now = atomic64_read(&hwc->count);
2100         if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
2101                 goto again;
2102
2103         delta = now - prev;
2104
2105         atomic64_add(delta, &counter->count);
2106         atomic64_sub(delta, &hwc->period_left);
2107 }
2108
2109 static void perf_swcounter_set_period(struct perf_counter *counter)
2110 {
2111         struct hw_perf_counter *hwc = &counter->hw;
2112         s64 left = atomic64_read(&hwc->period_left);
2113         s64 period = hwc->irq_period;
2114
2115         if (unlikely(left <= -period)) {
2116                 left = period;
2117                 atomic64_set(&hwc->period_left, left);
2118         }
2119
2120         if (unlikely(left <= 0)) {
2121                 left += period;
2122                 atomic64_add(period, &hwc->period_left);
2123         }
2124
2125         atomic64_set(&hwc->prev_count, -left);
2126         atomic64_set(&hwc->count, -left);
2127 }
2128
2129 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
2130 {
2131         enum hrtimer_restart ret = HRTIMER_RESTART;
2132         struct perf_counter *counter;
2133         struct pt_regs *regs;
2134
2135         counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
2136         counter->hw_ops->read(counter);
2137
2138         regs = get_irq_regs();
2139         /*
2140          * In case we exclude kernel IPs or are somehow not in interrupt
2141          * context, provide the next best thing, the user IP.
2142          */
2143         if ((counter->hw_event.exclude_kernel || !regs) &&
2144                         !counter->hw_event.exclude_user)
2145                 regs = task_pt_regs(current);
2146
2147         if (regs) {
2148                 if (perf_counter_overflow(counter, 0, regs))
2149                         ret = HRTIMER_NORESTART;
2150         }
2151
2152         hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
2153
2154         return ret;
2155 }
2156
2157 static void perf_swcounter_overflow(struct perf_counter *counter,
2158                                     int nmi, struct pt_regs *regs)
2159 {
2160         perf_swcounter_update(counter);
2161         perf_swcounter_set_period(counter);
2162         if (perf_counter_overflow(counter, nmi, regs))
2163                 /* soft-disable the counter */
2164                 ;
2165
2166 }
2167
2168 static int perf_swcounter_match(struct perf_counter *counter,
2169                                 enum perf_event_types type,
2170                                 u32 event, struct pt_regs *regs)
2171 {
2172         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
2173                 return 0;
2174
2175         if (perf_event_raw(&counter->hw_event))
2176                 return 0;
2177
2178         if (perf_event_type(&counter->hw_event) != type)
2179                 return 0;
2180
2181         if (perf_event_id(&counter->hw_event) != event)
2182                 return 0;
2183
2184         if (counter->hw_event.exclude_user && user_mode(regs))
2185                 return 0;
2186
2187         if (counter->hw_event.exclude_kernel && !user_mode(regs))
2188                 return 0;
2189
2190         return 1;
2191 }
2192
2193 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
2194                                int nmi, struct pt_regs *regs)
2195 {
2196         int neg = atomic64_add_negative(nr, &counter->hw.count);
2197         if (counter->hw.irq_period && !neg)
2198                 perf_swcounter_overflow(counter, nmi, regs);
2199 }
2200
2201 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
2202                                      enum perf_event_types type, u32 event,
2203                                      u64 nr, int nmi, struct pt_regs *regs)
2204 {
2205         struct perf_counter *counter;
2206
2207         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
2208                 return;
2209
2210         rcu_read_lock();
2211         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
2212                 if (perf_swcounter_match(counter, type, event, regs))
2213                         perf_swcounter_add(counter, nr, nmi, regs);
2214         }
2215         rcu_read_unlock();
2216 }
2217
2218 static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
2219 {
2220         if (in_nmi())
2221                 return &cpuctx->recursion[3];
2222
2223         if (in_irq())
2224                 return &cpuctx->recursion[2];
2225
2226         if (in_softirq())
2227                 return &cpuctx->recursion[1];
2228
2229         return &cpuctx->recursion[0];
2230 }
2231
2232 static void __perf_swcounter_event(enum perf_event_types type, u32 event,
2233                                    u64 nr, int nmi, struct pt_regs *regs)
2234 {
2235         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
2236         int *recursion = perf_swcounter_recursion_context(cpuctx);
2237
2238         if (*recursion)
2239                 goto out;
2240
2241         (*recursion)++;
2242         barrier();
2243
2244         perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
2245         if (cpuctx->task_ctx) {
2246                 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
2247                                 nr, nmi, regs);
2248         }
2249
2250         barrier();
2251         (*recursion)--;
2252
2253 out:
2254         put_cpu_var(perf_cpu_context);
2255 }
2256
2257 void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
2258 {
2259         __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
2260 }
2261
2262 static void perf_swcounter_read(struct perf_counter *counter)
2263 {
2264         perf_swcounter_update(counter);
2265 }
2266
2267 static int perf_swcounter_enable(struct perf_counter *counter)
2268 {
2269         perf_swcounter_set_period(counter);
2270         return 0;
2271 }
2272
2273 static void perf_swcounter_disable(struct perf_counter *counter)
2274 {
2275         perf_swcounter_update(counter);
2276 }
2277
2278 static const struct hw_perf_counter_ops perf_ops_generic = {
2279         .enable         = perf_swcounter_enable,
2280         .disable        = perf_swcounter_disable,
2281         .read           = perf_swcounter_read,
2282 };
2283
2284 /*
2285  * Software counter: cpu wall time clock
2286  */
2287
2288 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
2289 {
2290         int cpu = raw_smp_processor_id();
2291         s64 prev;
2292         u64 now;
2293
2294         now = cpu_clock(cpu);
2295         prev = atomic64_read(&counter->hw.prev_count);
2296         atomic64_set(&counter->hw.prev_count, now);
2297         atomic64_add(now - prev, &counter->count);
2298 }
2299
2300 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
2301 {
2302         struct hw_perf_counter *hwc = &counter->hw;
2303         int cpu = raw_smp_processor_id();
2304
2305         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
2306         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2307         hwc->hrtimer.function = perf_swcounter_hrtimer;
2308         if (hwc->irq_period) {
2309                 __hrtimer_start_range_ns(&hwc->hrtimer,
2310                                 ns_to_ktime(hwc->irq_period), 0,
2311                                 HRTIMER_MODE_REL, 0);
2312         }
2313
2314         return 0;
2315 }
2316
2317 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
2318 {
2319         hrtimer_cancel(&counter->hw.hrtimer);
2320         cpu_clock_perf_counter_update(counter);
2321 }
2322
2323 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
2324 {
2325         cpu_clock_perf_counter_update(counter);
2326 }
2327
2328 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
2329         .enable         = cpu_clock_perf_counter_enable,
2330         .disable        = cpu_clock_perf_counter_disable,
2331         .read           = cpu_clock_perf_counter_read,
2332 };
2333
2334 /*
2335  * Software counter: task time clock
2336  */
2337
2338 static void task_clock_perf_counter_update(struct perf_counter *counter)
2339 {
2340         u64 prev, now;
2341         s64 delta;
2342
2343         now = counter->ctx->time;
2344
2345         prev = atomic64_xchg(&counter->hw.prev_count, now);
2346         delta = now - prev;
2347         atomic64_add(delta, &counter->count);
2348 }
2349
2350 static int task_clock_perf_counter_enable(struct perf_counter *counter)
2351 {
2352         struct hw_perf_counter *hwc = &counter->hw;
2353         u64 now;
2354
2355         now = counter->ctx->time;
2356
2357         atomic64_set(&hwc->prev_count, now);
2358         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2359         hwc->hrtimer.function = perf_swcounter_hrtimer;
2360         if (hwc->irq_period) {
2361                 __hrtimer_start_range_ns(&hwc->hrtimer,
2362                                 ns_to_ktime(hwc->irq_period), 0,
2363                                 HRTIMER_MODE_REL, 0);
2364         }
2365
2366         return 0;
2367 }
2368
2369 static void task_clock_perf_counter_disable(struct perf_counter *counter)
2370 {
2371         hrtimer_cancel(&counter->hw.hrtimer);
2372         task_clock_perf_counter_update(counter);
2373 }
2374
2375 static void task_clock_perf_counter_read(struct perf_counter *counter)
2376 {
2377         update_context_time(counter->ctx);
2378         task_clock_perf_counter_update(counter);
2379 }
2380
2381 static const struct hw_perf_counter_ops perf_ops_task_clock = {
2382         .enable         = task_clock_perf_counter_enable,
2383         .disable        = task_clock_perf_counter_disable,
2384         .read           = task_clock_perf_counter_read,
2385 };
2386
2387 /*
2388  * Software counter: cpu migrations
2389  */
2390
2391 static inline u64 get_cpu_migrations(struct perf_counter *counter)
2392 {
2393         struct task_struct *curr = counter->ctx->task;
2394
2395         if (curr)
2396                 return curr->se.nr_migrations;
2397         return cpu_nr_migrations(smp_processor_id());
2398 }
2399
2400 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
2401 {
2402         u64 prev, now;
2403         s64 delta;
2404
2405         prev = atomic64_read(&counter->hw.prev_count);
2406         now = get_cpu_migrations(counter);
2407
2408         atomic64_set(&counter->hw.prev_count, now);
2409
2410         delta = now - prev;
2411
2412         atomic64_add(delta, &counter->count);
2413 }
2414
2415 static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
2416 {
2417         cpu_migrations_perf_counter_update(counter);
2418 }
2419
2420 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
2421 {
2422         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
2423                 atomic64_set(&counter->hw.prev_count,
2424                              get_cpu_migrations(counter));
2425         return 0;
2426 }
2427
2428 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
2429 {
2430         cpu_migrations_perf_counter_update(counter);
2431 }
2432
2433 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
2434         .enable         = cpu_migrations_perf_counter_enable,
2435         .disable        = cpu_migrations_perf_counter_disable,
2436         .read           = cpu_migrations_perf_counter_read,
2437 };
2438
2439 #ifdef CONFIG_EVENT_PROFILE
2440 void perf_tpcounter_event(int event_id)
2441 {
2442         struct pt_regs *regs = get_irq_regs();
2443
2444         if (!regs)
2445                 regs = task_pt_regs(current);
2446
2447         __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
2448 }
2449
2450 extern int ftrace_profile_enable(int);
2451 extern void ftrace_profile_disable(int);
2452
2453 static void tp_perf_counter_destroy(struct perf_counter *counter)
2454 {
2455         ftrace_profile_disable(perf_event_id(&counter->hw_event));
2456 }
2457
2458 static const struct hw_perf_counter_ops *
2459 tp_perf_counter_init(struct perf_counter *counter)
2460 {
2461         int event_id = perf_event_id(&counter->hw_event);
2462         int ret;
2463
2464         ret = ftrace_profile_enable(event_id);
2465         if (ret)
2466                 return NULL;
2467
2468         counter->destroy = tp_perf_counter_destroy;
2469         counter->hw.irq_period = counter->hw_event.irq_period;
2470
2471         return &perf_ops_generic;
2472 }
2473 #else
2474 static const struct hw_perf_counter_ops *
2475 tp_perf_counter_init(struct perf_counter *counter)
2476 {
2477         return NULL;
2478 }
2479 #endif
2480
2481 static const struct hw_perf_counter_ops *
2482 sw_perf_counter_init(struct perf_counter *counter)
2483 {
2484         struct perf_counter_hw_event *hw_event = &counter->hw_event;
2485         const struct hw_perf_counter_ops *hw_ops = NULL;
2486         struct hw_perf_counter *hwc = &counter->hw;
2487
2488         /*
2489          * Software counters (currently) can't in general distinguish
2490          * between user, kernel and hypervisor events.
2491          * However, context switches and cpu migrations are considered
2492          * to be kernel events, and page faults are never hypervisor
2493          * events.
2494          */
2495         switch (perf_event_id(&counter->hw_event)) {
2496         case PERF_COUNT_CPU_CLOCK:
2497                 hw_ops = &perf_ops_cpu_clock;
2498
2499                 if (hw_event->irq_period && hw_event->irq_period < 10000)
2500                         hw_event->irq_period = 10000;
2501                 break;
2502         case PERF_COUNT_TASK_CLOCK:
2503                 /*
2504                  * If the user instantiates this as a per-cpu counter,
2505                  * use the cpu_clock counter instead.
2506                  */
2507                 if (counter->ctx->task)
2508                         hw_ops = &perf_ops_task_clock;
2509                 else
2510                         hw_ops = &perf_ops_cpu_clock;
2511
2512                 if (hw_event->irq_period && hw_event->irq_period < 10000)
2513                         hw_event->irq_period = 10000;
2514                 break;
2515         case PERF_COUNT_PAGE_FAULTS:
2516         case PERF_COUNT_PAGE_FAULTS_MIN:
2517         case PERF_COUNT_PAGE_FAULTS_MAJ:
2518         case PERF_COUNT_CONTEXT_SWITCHES:
2519                 hw_ops = &perf_ops_generic;
2520                 break;
2521         case PERF_COUNT_CPU_MIGRATIONS:
2522                 if (!counter->hw_event.exclude_kernel)
2523                         hw_ops = &perf_ops_cpu_migrations;
2524                 break;
2525         }
2526
2527         if (hw_ops)
2528                 hwc->irq_period = hw_event->irq_period;
2529
2530         return hw_ops;
2531 }
2532
2533 /*
2534  * Allocate and initialize a counter structure
2535  */
2536 static struct perf_counter *
2537 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
2538                    int cpu,
2539                    struct perf_counter_context *ctx,
2540                    struct perf_counter *group_leader,
2541                    gfp_t gfpflags)
2542 {
2543         const struct hw_perf_counter_ops *hw_ops;
2544         struct perf_counter *counter;
2545         long err;
2546
2547         counter = kzalloc(sizeof(*counter), gfpflags);
2548         if (!counter)
2549                 return ERR_PTR(-ENOMEM);
2550
2551         /*
2552          * Single counters are their own group leaders, with an
2553          * empty sibling list:
2554          */
2555         if (!group_leader)
2556                 group_leader = counter;
2557
2558         mutex_init(&counter->mutex);
2559         INIT_LIST_HEAD(&counter->list_entry);
2560         INIT_LIST_HEAD(&counter->event_entry);
2561         INIT_LIST_HEAD(&counter->sibling_list);
2562         init_waitqueue_head(&counter->waitq);
2563
2564         mutex_init(&counter->mmap_mutex);
2565
2566         INIT_LIST_HEAD(&counter->child_list);
2567
2568         counter->cpu                    = cpu;
2569         counter->hw_event               = *hw_event;
2570         counter->group_leader           = group_leader;
2571         counter->hw_ops                 = NULL;
2572         counter->ctx                    = ctx;
2573
2574         counter->state = PERF_COUNTER_STATE_INACTIVE;
2575         if (hw_event->disabled)
2576                 counter->state = PERF_COUNTER_STATE_OFF;
2577
2578         hw_ops = NULL;
2579
2580         if (perf_event_raw(hw_event)) {
2581                 hw_ops = hw_perf_counter_init(counter);
2582                 goto done;
2583         }
2584
2585         switch (perf_event_type(hw_event)) {
2586         case PERF_TYPE_HARDWARE:
2587                 hw_ops = hw_perf_counter_init(counter);
2588                 break;
2589
2590         case PERF_TYPE_SOFTWARE:
2591                 hw_ops = sw_perf_counter_init(counter);
2592                 break;
2593
2594         case PERF_TYPE_TRACEPOINT:
2595                 hw_ops = tp_perf_counter_init(counter);
2596                 break;
2597         }
2598 done:
2599         err = 0;
2600         if (!hw_ops)
2601                 err = -EINVAL;
2602         else if (IS_ERR(hw_ops))
2603                 err = PTR_ERR(hw_ops);
2604
2605         if (err) {
2606                 kfree(counter);
2607                 return ERR_PTR(err);
2608         }
2609
2610         counter->hw_ops = hw_ops;
2611
2612         return counter;
2613 }
2614
2615 /**
2616  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
2617  *
2618  * @hw_event_uptr:      event type attributes for monitoring/sampling
2619  * @pid:                target pid
2620  * @cpu:                target cpu
2621  * @group_fd:           group leader counter fd
2622  */
2623 SYSCALL_DEFINE5(perf_counter_open,
2624                 const struct perf_counter_hw_event __user *, hw_event_uptr,
2625                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
2626 {
2627         struct perf_counter *counter, *group_leader;
2628         struct perf_counter_hw_event hw_event;
2629         struct perf_counter_context *ctx;
2630         struct file *counter_file = NULL;
2631         struct file *group_file = NULL;
2632         int fput_needed = 0;
2633         int fput_needed2 = 0;
2634         int ret;
2635
2636         /* for future expandability... */
2637         if (flags)
2638                 return -EINVAL;
2639
2640         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
2641                 return -EFAULT;
2642
2643         /*
2644          * Get the target context (task or percpu):
2645          */
2646         ctx = find_get_context(pid, cpu);
2647         if (IS_ERR(ctx))
2648                 return PTR_ERR(ctx);
2649
2650         /*
2651          * Look up the group leader (we will attach this counter to it):
2652          */
2653         group_leader = NULL;
2654         if (group_fd != -1) {
2655                 ret = -EINVAL;
2656                 group_file = fget_light(group_fd, &fput_needed);
2657                 if (!group_file)
2658                         goto err_put_context;
2659                 if (group_file->f_op != &perf_fops)
2660                         goto err_put_context;
2661
2662                 group_leader = group_file->private_data;
2663                 /*
2664                  * Do not allow a recursive hierarchy (this new sibling
2665                  * becoming part of another group-sibling):
2666                  */
2667                 if (group_leader->group_leader != group_leader)
2668                         goto err_put_context;
2669                 /*
2670                  * Do not allow to attach to a group in a different
2671                  * task or CPU context:
2672                  */
2673                 if (group_leader->ctx != ctx)
2674                         goto err_put_context;
2675                 /*
2676                  * Only a group leader can be exclusive or pinned
2677                  */
2678                 if (hw_event.exclusive || hw_event.pinned)
2679                         goto err_put_context;
2680         }
2681
2682         counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2683                                      GFP_KERNEL);
2684         ret = PTR_ERR(counter);
2685         if (IS_ERR(counter))
2686                 goto err_put_context;
2687
2688         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2689         if (ret < 0)
2690                 goto err_free_put_context;
2691
2692         counter_file = fget_light(ret, &fput_needed2);
2693         if (!counter_file)
2694                 goto err_free_put_context;
2695
2696         counter->filp = counter_file;
2697         mutex_lock(&ctx->mutex);
2698         perf_install_in_context(ctx, counter, cpu);
2699         mutex_unlock(&ctx->mutex);
2700
2701         fput_light(counter_file, fput_needed2);
2702
2703 out_fput:
2704         fput_light(group_file, fput_needed);
2705
2706         return ret;
2707
2708 err_free_put_context:
2709         kfree(counter);
2710
2711 err_put_context:
2712         put_context(ctx);
2713
2714         goto out_fput;
2715 }
2716
2717 /*
2718  * Initialize the perf_counter context in a task_struct:
2719  */
2720 static void
2721 __perf_counter_init_context(struct perf_counter_context *ctx,
2722                             struct task_struct *task)
2723 {
2724         memset(ctx, 0, sizeof(*ctx));
2725         spin_lock_init(&ctx->lock);
2726         mutex_init(&ctx->mutex);
2727         INIT_LIST_HEAD(&ctx->counter_list);
2728         INIT_LIST_HEAD(&ctx->event_list);
2729         ctx->task = task;
2730 }
2731
2732 /*
2733  * inherit a counter from parent task to child task:
2734  */
2735 static struct perf_counter *
2736 inherit_counter(struct perf_counter *parent_counter,
2737               struct task_struct *parent,
2738               struct perf_counter_context *parent_ctx,
2739               struct task_struct *child,
2740               struct perf_counter *group_leader,
2741               struct perf_counter_context *child_ctx)
2742 {
2743         struct perf_counter *child_counter;
2744
2745         /*
2746          * Instead of creating recursive hierarchies of counters,
2747          * we link inherited counters back to the original parent,
2748          * which has a filp for sure, which we use as the reference
2749          * count:
2750          */
2751         if (parent_counter->parent)
2752                 parent_counter = parent_counter->parent;
2753
2754         child_counter = perf_counter_alloc(&parent_counter->hw_event,
2755                                            parent_counter->cpu, child_ctx,
2756                                            group_leader, GFP_KERNEL);
2757         if (IS_ERR(child_counter))
2758                 return child_counter;
2759
2760         /*
2761          * Link it up in the child's context:
2762          */
2763         child_counter->task = child;
2764         add_counter_to_ctx(child_counter, child_ctx);
2765
2766         child_counter->parent = parent_counter;
2767         /*
2768          * inherit into child's child as well:
2769          */
2770         child_counter->hw_event.inherit = 1;
2771
2772         /*
2773          * Get a reference to the parent filp - we will fput it
2774          * when the child counter exits. This is safe to do because
2775          * we are in the parent and we know that the filp still
2776          * exists and has a nonzero count:
2777          */
2778         atomic_long_inc(&parent_counter->filp->f_count);
2779
2780         /*
2781          * Link this into the parent counter's child list
2782          */
2783         mutex_lock(&parent_counter->mutex);
2784         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2785
2786         /*
2787          * Make the child state follow the state of the parent counter,
2788          * not its hw_event.disabled bit.  We hold the parent's mutex,
2789          * so we won't race with perf_counter_{en,dis}able_family.
2790          */
2791         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2792                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2793         else
2794                 child_counter->state = PERF_COUNTER_STATE_OFF;
2795
2796         mutex_unlock(&parent_counter->mutex);
2797
2798         return child_counter;
2799 }
2800
2801 static int inherit_group(struct perf_counter *parent_counter,
2802               struct task_struct *parent,
2803               struct perf_counter_context *parent_ctx,
2804               struct task_struct *child,
2805               struct perf_counter_context *child_ctx)
2806 {
2807         struct perf_counter *leader;
2808         struct perf_counter *sub;
2809         struct perf_counter *child_ctr;
2810
2811         leader = inherit_counter(parent_counter, parent, parent_ctx,
2812                                  child, NULL, child_ctx);
2813         if (IS_ERR(leader))
2814                 return PTR_ERR(leader);
2815         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2816                 child_ctr = inherit_counter(sub, parent, parent_ctx,
2817                                             child, leader, child_ctx);
2818                 if (IS_ERR(child_ctr))
2819                         return PTR_ERR(child_ctr);
2820         }
2821         return 0;
2822 }
2823
2824 static void sync_child_counter(struct perf_counter *child_counter,
2825                                struct perf_counter *parent_counter)
2826 {
2827         u64 parent_val, child_val;
2828
2829         parent_val = atomic64_read(&parent_counter->count);
2830         child_val = atomic64_read(&child_counter->count);
2831
2832         /*
2833          * Add back the child's count to the parent's count:
2834          */
2835         atomic64_add(child_val, &parent_counter->count);
2836         atomic64_add(child_counter->total_time_enabled,
2837                      &parent_counter->child_total_time_enabled);
2838         atomic64_add(child_counter->total_time_running,
2839                      &parent_counter->child_total_time_running);
2840
2841         /*
2842          * Remove this counter from the parent's list
2843          */
2844         mutex_lock(&parent_counter->mutex);
2845         list_del_init(&child_counter->child_list);
2846         mutex_unlock(&parent_counter->mutex);
2847
2848         /*
2849          * Release the parent counter, if this was the last
2850          * reference to it.
2851          */
2852         fput(parent_counter->filp);
2853 }
2854
2855 static void
2856 __perf_counter_exit_task(struct task_struct *child,
2857                          struct perf_counter *child_counter,
2858                          struct perf_counter_context *child_ctx)
2859 {
2860         struct perf_counter *parent_counter;
2861         struct perf_counter *sub, *tmp;
2862
2863         /*
2864          * If we do not self-reap then we have to wait for the
2865          * child task to unschedule (it will happen for sure),
2866          * so that its counter is at its final count. (This
2867          * condition triggers rarely - child tasks usually get
2868          * off their CPU before the parent has a chance to
2869          * get this far into the reaping action)
2870          */
2871         if (child != current) {
2872                 wait_task_inactive(child, 0);
2873                 list_del_init(&child_counter->list_entry);
2874                 update_counter_times(child_counter);
2875         } else {
2876                 struct perf_cpu_context *cpuctx;
2877                 unsigned long flags;
2878                 u64 perf_flags;
2879
2880                 /*
2881                  * Disable and unlink this counter.
2882                  *
2883                  * Be careful about zapping the list - IRQ/NMI context
2884                  * could still be processing it:
2885                  */
2886                 local_irq_save(flags);
2887                 perf_flags = hw_perf_save_disable();
2888
2889                 cpuctx = &__get_cpu_var(perf_cpu_context);
2890
2891                 group_sched_out(child_counter, cpuctx, child_ctx);
2892                 update_counter_times(child_counter);
2893
2894                 list_del_init(&child_counter->list_entry);
2895
2896                 child_ctx->nr_counters--;
2897
2898                 hw_perf_restore(perf_flags);
2899                 local_irq_restore(flags);
2900         }
2901
2902         parent_counter = child_counter->parent;
2903         /*
2904          * It can happen that parent exits first, and has counters
2905          * that are still around due to the child reference. These
2906          * counters need to be zapped - but otherwise linger.
2907          */
2908         if (parent_counter) {
2909                 sync_child_counter(child_counter, parent_counter);
2910                 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
2911                                          list_entry) {
2912                         if (sub->parent) {
2913                                 sync_child_counter(sub, sub->parent);
2914                                 free_counter(sub);
2915                         }
2916                 }
2917                 free_counter(child_counter);
2918         }
2919 }
2920
2921 /*
2922  * When a child task exits, feed back counter values to parent counters.
2923  *
2924  * Note: we may be running in child context, but the PID is not hashed
2925  * anymore so new counters will not be added.
2926  */
2927 void perf_counter_exit_task(struct task_struct *child)
2928 {
2929         struct perf_counter *child_counter, *tmp;
2930         struct perf_counter_context *child_ctx;
2931
2932         child_ctx = &child->perf_counter_ctx;
2933
2934         if (likely(!child_ctx->nr_counters))
2935                 return;
2936
2937         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
2938                                  list_entry)
2939                 __perf_counter_exit_task(child, child_counter, child_ctx);
2940 }
2941
2942 /*
2943  * Initialize the perf_counter context in task_struct
2944  */
2945 void perf_counter_init_task(struct task_struct *child)
2946 {
2947         struct perf_counter_context *child_ctx, *parent_ctx;
2948         struct perf_counter *counter;
2949         struct task_struct *parent = current;
2950
2951         child_ctx  =  &child->perf_counter_ctx;
2952         parent_ctx = &parent->perf_counter_ctx;
2953
2954         __perf_counter_init_context(child_ctx, child);
2955
2956         /*
2957          * This is executed from the parent task context, so inherit
2958          * counters that have been marked for cloning:
2959          */
2960
2961         if (likely(!parent_ctx->nr_counters))
2962                 return;
2963
2964         /*
2965          * Lock the parent list. No need to lock the child - not PID
2966          * hashed yet and not running, so nobody can access it.
2967          */
2968         mutex_lock(&parent_ctx->mutex);
2969
2970         /*
2971          * We dont have to disable NMIs - we are only looking at
2972          * the list, not manipulating it:
2973          */
2974         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2975                 if (!counter->hw_event.inherit)
2976                         continue;
2977
2978                 if (inherit_group(counter, parent,
2979                                   parent_ctx, child, child_ctx))
2980                         break;
2981         }
2982
2983         mutex_unlock(&parent_ctx->mutex);
2984 }
2985
2986 static void __cpuinit perf_counter_init_cpu(int cpu)
2987 {
2988         struct perf_cpu_context *cpuctx;
2989
2990         cpuctx = &per_cpu(perf_cpu_context, cpu);
2991         __perf_counter_init_context(&cpuctx->ctx, NULL);
2992
2993         mutex_lock(&perf_resource_mutex);
2994         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2995         mutex_unlock(&perf_resource_mutex);
2996
2997         hw_perf_counter_setup(cpu);
2998 }
2999
3000 #ifdef CONFIG_HOTPLUG_CPU
3001 static void __perf_counter_exit_cpu(void *info)
3002 {
3003         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
3004         struct perf_counter_context *ctx = &cpuctx->ctx;
3005         struct perf_counter *counter, *tmp;
3006
3007         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
3008                 __perf_counter_remove_from_context(counter);
3009 }
3010 static void perf_counter_exit_cpu(int cpu)
3011 {
3012         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
3013         struct perf_counter_context *ctx = &cpuctx->ctx;
3014
3015         mutex_lock(&ctx->mutex);
3016         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
3017         mutex_unlock(&ctx->mutex);
3018 }
3019 #else
3020 static inline void perf_counter_exit_cpu(int cpu) { }
3021 #endif
3022
3023 static int __cpuinit
3024 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
3025 {
3026         unsigned int cpu = (long)hcpu;
3027
3028         switch (action) {
3029
3030         case CPU_UP_PREPARE:
3031         case CPU_UP_PREPARE_FROZEN:
3032                 perf_counter_init_cpu(cpu);
3033                 break;
3034
3035         case CPU_DOWN_PREPARE:
3036         case CPU_DOWN_PREPARE_FROZEN:
3037                 perf_counter_exit_cpu(cpu);
3038                 break;
3039
3040         default:
3041                 break;
3042         }
3043
3044         return NOTIFY_OK;
3045 }
3046
3047 static struct notifier_block __cpuinitdata perf_cpu_nb = {
3048         .notifier_call          = perf_cpu_notify,
3049 };
3050
3051 static int __init perf_counter_init(void)
3052 {
3053         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
3054                         (void *)(long)smp_processor_id());
3055         register_cpu_notifier(&perf_cpu_nb);
3056
3057         return 0;
3058 }
3059 early_initcall(perf_counter_init);
3060
3061 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
3062 {
3063         return sprintf(buf, "%d\n", perf_reserved_percpu);
3064 }
3065
3066 static ssize_t
3067 perf_set_reserve_percpu(struct sysdev_class *class,
3068                         const char *buf,
3069                         size_t count)
3070 {
3071         struct perf_cpu_context *cpuctx;
3072         unsigned long val;
3073         int err, cpu, mpt;
3074
3075         err = strict_strtoul(buf, 10, &val);
3076         if (err)
3077                 return err;
3078         if (val > perf_max_counters)
3079                 return -EINVAL;
3080
3081         mutex_lock(&perf_resource_mutex);
3082         perf_reserved_percpu = val;
3083         for_each_online_cpu(cpu) {
3084                 cpuctx = &per_cpu(perf_cpu_context, cpu);
3085                 spin_lock_irq(&cpuctx->ctx.lock);
3086                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
3087                           perf_max_counters - perf_reserved_percpu);
3088                 cpuctx->max_pertask = mpt;
3089                 spin_unlock_irq(&cpuctx->ctx.lock);
3090         }
3091         mutex_unlock(&perf_resource_mutex);
3092
3093         return count;
3094 }
3095
3096 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
3097 {
3098         return sprintf(buf, "%d\n", perf_overcommit);
3099 }
3100
3101 static ssize_t
3102 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
3103 {
3104         unsigned long val;
3105         int err;
3106
3107         err = strict_strtoul(buf, 10, &val);
3108         if (err)
3109                 return err;
3110         if (val > 1)
3111                 return -EINVAL;
3112
3113         mutex_lock(&perf_resource_mutex);
3114         perf_overcommit = val;
3115         mutex_unlock(&perf_resource_mutex);
3116
3117         return count;
3118 }
3119
3120 static SYSDEV_CLASS_ATTR(
3121                                 reserve_percpu,
3122                                 0644,
3123                                 perf_show_reserve_percpu,
3124                                 perf_set_reserve_percpu
3125                         );
3126
3127 static SYSDEV_CLASS_ATTR(
3128                                 overcommit,
3129                                 0644,
3130                                 perf_show_overcommit,
3131                                 perf_set_overcommit
3132                         );
3133
3134 static struct attribute *perfclass_attrs[] = {
3135         &attr_reserve_percpu.attr,
3136         &attr_overcommit.attr,
3137         NULL
3138 };
3139
3140 static struct attribute_group perfclass_attr_group = {
3141         .attrs                  = perfclass_attrs,
3142         .name                   = "perf_counters",
3143 };
3144
3145 static int __init perf_counter_sysfs_init(void)
3146 {
3147         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
3148                                   &perfclass_attr_group);
3149 }
3150 device_initcall(perf_counter_sysfs_init);