kernel/perf_counter.c

   1 /*
   2  * Performance counter core code
   3  *
   4  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
   6  *
   7  *  For licencing details see kernel-base/COPYING
   8  */
   9
  10 #include <linux/fs.h>
  11 #include <linux/cpu.h>
  12 #include <linux/smp.h>
  13 #include <linux/file.h>
  14 #include <linux/poll.h>
  15 #include <linux/sysfs.h>
  16 #include <linux/ptrace.h>
  17 #include <linux/percpu.h>
  18 #include <linux/uaccess.h>
  19 #include <linux/syscalls.h>
  20 #include <linux/anon_inodes.h>
  21 #include <linux/kernel_stat.h>
  22 #include <linux/perf_counter.h>
  23 #include <linux/mm.h>
  24 #include <linux/vmstat.h>
  25 #include <linux/rculist.h>
  26 #include <linux/hardirq.h>
  27
  28 #include <asm/irq_regs.h>
  29
  30 /*
  31  * Each CPU has a list of per CPU counters:
  32  */
  33 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
  34
  35 int perf_max_counters __read_mostly = 1;
  36 static int perf_reserved_percpu __read_mostly;
  37 static int perf_overcommit __read_mostly = 1;
  38
  39 /*
  40  * Mutex for (sysadmin-configurable) counter reservations:
  41  */
  42 static DEFINE_MUTEX(perf_resource_mutex);
  43
  44 /*
  45  * Architecture provided APIs - weak aliases:
  46  */
  47 extern __weak const struct hw_perf_counter_ops *
  48 hw_perf_counter_init(struct perf_counter *counter)
  49 {
  50         return NULL;
  51 }
  52
  53 u64 __weak hw_perf_save_disable(void)           { return 0; }
  54 void __weak hw_perf_restore(u64 ctrl)           { barrier(); }
  55 void __weak hw_perf_counter_setup(int cpu)      { barrier(); }
  56 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
  57                struct perf_cpu_context *cpuctx,
  58                struct perf_counter_context *ctx, int cpu)
  59 {
  60         return 0;
  61 }
  62
  63 void __weak perf_counter_print_debug(void)      { }
  64
  65 static void
  66 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  67 {
  68         struct perf_counter *group_leader = counter->group_leader;
  69
  70         /*
  71          * Depending on whether it is a standalone or sibling counter,
  72          * add it straight to the context's counter list, or to the group
  73          * leader's sibling list:
  74          */
  75         if (counter->group_leader == counter)
  76                 list_add_tail(&counter->list_entry, &ctx->counter_list);
  77         else
  78                 list_add_tail(&counter->list_entry, &group_leader->sibling_list);
  79
  80         list_add_rcu(&counter->event_entry, &ctx->event_list);
  81 }
  82
  83 static void
  84 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
  85 {
  86         struct perf_counter *sibling, *tmp;
  87
  88         list_del_init(&counter->list_entry);
  89         list_del_rcu(&counter->event_entry);
  90
  91         /*
  92          * If this was a group counter with sibling counters then
  93          * upgrade the siblings to singleton counters by adding them
  94          * to the context list directly:
  95          */
  96         list_for_each_entry_safe(sibling, tmp,
  97                                  &counter->sibling_list, list_entry) {
  98
  99                 list_move_tail(&sibling->list_entry, &ctx->counter_list);
 100                 sibling->group_leader = sibling;
 101         }
 102 }
 103
 104 static void
 105 counter_sched_out(struct perf_counter *counter,
 106                   struct perf_cpu_context *cpuctx,
 107                   struct perf_counter_context *ctx)
 108 {
 109         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 110                 return;
 111
 112         counter->state = PERF_COUNTER_STATE_INACTIVE;
 113         counter->hw_ops->disable(counter);
 114         counter->oncpu = -1;
 115
 116         if (!is_software_counter(counter))
 117                 cpuctx->active_oncpu--;
 118         ctx->nr_active--;
 119         if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
 120                 cpuctx->exclusive = 0;
 121 }
 122
 123 static void
 124 group_sched_out(struct perf_counter *group_counter,
 125                 struct perf_cpu_context *cpuctx,
 126                 struct perf_counter_context *ctx)
 127 {
 128         struct perf_counter *counter;
 129
 130         if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
 131                 return;
 132
 133         counter_sched_out(group_counter, cpuctx, ctx);
 134
 135         /*
 136          * Schedule out siblings (if any):
 137          */
 138         list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 139                 counter_sched_out(counter, cpuctx, ctx);
 140
 141         if (group_counter->hw_event.exclusive)
 142                 cpuctx->exclusive = 0;
 143 }
 144
 145 /*
 146  * Cross CPU call to remove a performance counter
 147  *
 148  * We disable the counter on the hardware level first. After that we
 149  * remove it from the context list.
 150  */
 151 static void __perf_counter_remove_from_context(void *info)
 152 {
 153         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 154         struct perf_counter *counter = info;
 155         struct perf_counter_context *ctx = counter->ctx;
 156         unsigned long flags;
 157         u64 perf_flags;
 158
 159         /*
 160          * If this is a task context, we need to check whether it is
 161          * the current task context of this cpu. If not it has been
 162          * scheduled out before the smp call arrived.
 163          */
 164         if (ctx->task && cpuctx->task_ctx != ctx)
 165                 return;
 166
 167         curr_rq_lock_irq_save(&flags);
 168         spin_lock(&ctx->lock);
 169
 170         counter_sched_out(counter, cpuctx, ctx);
 171
 172         counter->task = NULL;
 173         ctx->nr_counters--;
 174
 175         /*
 176          * Protect the list operation against NMI by disabling the
 177          * counters on a global level. NOP for non NMI based counters.
 178          */
 179         perf_flags = hw_perf_save_disable();
 180         list_del_counter(counter, ctx);
 181         hw_perf_restore(perf_flags);
 182
 183         if (!ctx->task) {
 184                 /*
 185                  * Allow more per task counters with respect to the
 186                  * reservation:
 187                  */
 188                 cpuctx->max_pertask =
 189                         min(perf_max_counters - ctx->nr_counters,
 190                             perf_max_counters - perf_reserved_percpu);
 191         }
 192
 193         spin_unlock(&ctx->lock);
 194         curr_rq_unlock_irq_restore(&flags);
 195 }
 196
 197
 198 /*
 199  * Remove the counter from a task's (or a CPU's) list of counters.
 200  *
 201  * Must be called with counter->mutex and ctx->mutex held.
 202  *
 203  * CPU counters are removed with a smp call. For task counters we only
 204  * call when the task is on a CPU.
 205  */
 206 static void perf_counter_remove_from_context(struct perf_counter *counter)
 207 {
 208         struct perf_counter_context *ctx = counter->ctx;
 209         struct task_struct *task = ctx->task;
 210
 211         if (!task) {
 212                 /*
 213                  * Per cpu counters are removed via an smp call and
 214                  * the removal is always sucessful.
 215                  */
 216                 smp_call_function_single(counter->cpu,
 217                                          __perf_counter_remove_from_context,
 218                                          counter, 1);
 219                 return;
 220         }
 221
 222 retry:
 223         task_oncpu_function_call(task, __perf_counter_remove_from_context,
 224                                  counter);
 225
 226         spin_lock_irq(&ctx->lock);
 227         /*
 228          * If the context is active we need to retry the smp call.
 229          */
 230         if (ctx->nr_active && !list_empty(&counter->list_entry)) {
 231                 spin_unlock_irq(&ctx->lock);
 232                 goto retry;
 233         }
 234
 235         /*
 236          * The lock prevents that this context is scheduled in so we
 237          * can remove the counter safely, if the call above did not
 238          * succeed.
 239          */
 240         if (!list_empty(&counter->list_entry)) {
 241                 ctx->nr_counters--;
 242                 list_del_counter(counter, ctx);
 243                 counter->task = NULL;
 244         }
 245         spin_unlock_irq(&ctx->lock);
 246 }
 247
 248 /*
 249  * Cross CPU call to disable a performance counter
 250  */
 251 static void __perf_counter_disable(void *info)
 252 {
 253         struct perf_counter *counter = info;
 254         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 255         struct perf_counter_context *ctx = counter->ctx;
 256         unsigned long flags;
 257
 258         /*
 259          * If this is a per-task counter, need to check whether this
 260          * counter's task is the current task on this cpu.
 261          */
 262         if (ctx->task && cpuctx->task_ctx != ctx)
 263                 return;
 264
 265         curr_rq_lock_irq_save(&flags);
 266         spin_lock(&ctx->lock);
 267
 268         /*
 269          * If the counter is on, turn it off.
 270          * If it is in error state, leave it in error state.
 271          */
 272         if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
 273                 if (counter == counter->group_leader)
 274                         group_sched_out(counter, cpuctx, ctx);
 275                 else
 276                         counter_sched_out(counter, cpuctx, ctx);
 277                 counter->state = PERF_COUNTER_STATE_OFF;
 278         }
 279
 280         spin_unlock(&ctx->lock);
 281         curr_rq_unlock_irq_restore(&flags);
 282 }
 283
 284 /*
 285  * Disable a counter.
 286  */
 287 static void perf_counter_disable(struct perf_counter *counter)
 288 {
 289         struct perf_counter_context *ctx = counter->ctx;
 290         struct task_struct *task = ctx->task;
 291
 292         if (!task) {
 293                 /*
 294                  * Disable the counter on the cpu that it's on
 295                  */
 296                 smp_call_function_single(counter->cpu, __perf_counter_disable,
 297                                          counter, 1);
 298                 return;
 299         }
 300
 301  retry:
 302         task_oncpu_function_call(task, __perf_counter_disable, counter);
 303
 304         spin_lock_irq(&ctx->lock);
 305         /*
 306          * If the counter is still active, we need to retry the cross-call.
 307          */
 308         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 309                 spin_unlock_irq(&ctx->lock);
 310                 goto retry;
 311         }
 312
 313         /*
 314          * Since we have the lock this context can't be scheduled
 315          * in, so we can change the state safely.
 316          */
 317         if (counter->state == PERF_COUNTER_STATE_INACTIVE)
 318                 counter->state = PERF_COUNTER_STATE_OFF;
 319
 320         spin_unlock_irq(&ctx->lock);
 321 }
 322
 323 /*
 324  * Disable a counter and all its children.
 325  */
 326 static void perf_counter_disable_family(struct perf_counter *counter)
 327 {
 328         struct perf_counter *child;
 329
 330         perf_counter_disable(counter);
 331
 332         /*
 333          * Lock the mutex to protect the list of children
 334          */
 335         mutex_lock(&counter->mutex);
 336         list_for_each_entry(child, &counter->child_list, child_list)
 337                 perf_counter_disable(child);
 338         mutex_unlock(&counter->mutex);
 339 }
 340
 341 static int
 342 counter_sched_in(struct perf_counter *counter,
 343                  struct perf_cpu_context *cpuctx,
 344                  struct perf_counter_context *ctx,
 345                  int cpu)
 346 {
 347         if (counter->state <= PERF_COUNTER_STATE_OFF)
 348                 return 0;
 349
 350         counter->state = PERF_COUNTER_STATE_ACTIVE;
 351         counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
 352         /*
 353          * The new state must be visible before we turn it on in the hardware:
 354          */
 355         smp_wmb();
 356
 357         if (counter->hw_ops->enable(counter)) {
 358                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 359                 counter->oncpu = -1;
 360                 return -EAGAIN;
 361         }
 362
 363         if (!is_software_counter(counter))
 364                 cpuctx->active_oncpu++;
 365         ctx->nr_active++;
 366
 367         if (counter->hw_event.exclusive)
 368                 cpuctx->exclusive = 1;
 369
 370         return 0;
 371 }
 372
 373 /*
 374  * Return 1 for a group consisting entirely of software counters,
 375  * 0 if the group contains any hardware counters.
 376  */
 377 static int is_software_only_group(struct perf_counter *leader)
 378 {
 379         struct perf_counter *counter;
 380
 381         if (!is_software_counter(leader))
 382                 return 0;
 383         list_for_each_entry(counter, &leader->sibling_list, list_entry)
 384                 if (!is_software_counter(counter))
 385                         return 0;
 386         return 1;
 387 }
 388
 389 /*
 390  * Work out whether we can put this counter group on the CPU now.
 391  */
 392 static int group_can_go_on(struct perf_counter *counter,
 393                            struct perf_cpu_context *cpuctx,
 394                            int can_add_hw)
 395 {
 396         /*
 397          * Groups consisting entirely of software counters can always go on.
 398          */
 399         if (is_software_only_group(counter))
 400                 return 1;
 401         /*
 402          * If an exclusive group is already on, no other hardware
 403          * counters can go on.
 404          */
 405         if (cpuctx->exclusive)
 406                 return 0;
 407         /*
 408          * If this group is exclusive and there are already
 409          * counters on the CPU, it can't go on.
 410          */
 411         if (counter->hw_event.exclusive && cpuctx->active_oncpu)
 412                 return 0;
 413         /*
 414          * Otherwise, try to add it if all previous groups were able
 415          * to go on.
 416          */
 417         return can_add_hw;
 418 }
 419
 420 /*
 421  * Cross CPU call to install and enable a performance counter
 422  */
 423 static void __perf_install_in_context(void *info)
 424 {
 425         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 426         struct perf_counter *counter = info;
 427         struct perf_counter_context *ctx = counter->ctx;
 428         struct perf_counter *leader = counter->group_leader;
 429         int cpu = smp_processor_id();
 430         unsigned long flags;
 431         u64 perf_flags;
 432         int err;
 433
 434         /*
 435          * If this is a task context, we need to check whether it is
 436          * the current task context of this cpu. If not it has been
 437          * scheduled out before the smp call arrived.
 438          */
 439         if (ctx->task && cpuctx->task_ctx != ctx)
 440                 return;
 441
 442         curr_rq_lock_irq_save(&flags);
 443         spin_lock(&ctx->lock);
 444
 445         /*
 446          * Protect the list operation against NMI by disabling the
 447          * counters on a global level. NOP for non NMI based counters.
 448          */
 449         perf_flags = hw_perf_save_disable();
 450
 451         list_add_counter(counter, ctx);
 452         ctx->nr_counters++;
 453         counter->prev_state = PERF_COUNTER_STATE_OFF;
 454
 455         /*
 456          * Don't put the counter on if it is disabled or if
 457          * it is in a group and the group isn't on.
 458          */
 459         if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
 460             (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
 461                 goto unlock;
 462
 463         /*
 464          * An exclusive counter can't go on if there are already active
 465          * hardware counters, and no hardware counter can go on if there
 466          * is already an exclusive counter on.
 467          */
 468         if (!group_can_go_on(counter, cpuctx, 1))
 469                 err = -EEXIST;
 470         else
 471                 err = counter_sched_in(counter, cpuctx, ctx, cpu);
 472
 473         if (err) {
 474                 /*
 475                  * This counter couldn't go on.  If it is in a group
 476                  * then we have to pull the whole group off.
 477                  * If the counter group is pinned then put it in error state.
 478                  */
 479                 if (leader != counter)
 480                         group_sched_out(leader, cpuctx, ctx);
 481                 if (leader->hw_event.pinned)
 482                         leader->state = PERF_COUNTER_STATE_ERROR;
 483         }
 484
 485         if (!err && !ctx->task && cpuctx->max_pertask)
 486                 cpuctx->max_pertask--;
 487
 488  unlock:
 489         hw_perf_restore(perf_flags);
 490
 491         spin_unlock(&ctx->lock);
 492         curr_rq_unlock_irq_restore(&flags);
 493 }
 494
 495 /*
 496  * Attach a performance counter to a context
 497  *
 498  * First we add the counter to the list with the hardware enable bit
 499  * in counter->hw_config cleared.
 500  *
 501  * If the counter is attached to a task which is on a CPU we use a smp
 502  * call to enable it in the task context. The task might have been
 503  * scheduled away, but we check this in the smp call again.
 504  *
 505  * Must be called with ctx->mutex held.
 506  */
 507 static void
 508 perf_install_in_context(struct perf_counter_context *ctx,
 509                         struct perf_counter *counter,
 510                         int cpu)
 511 {
 512         struct task_struct *task = ctx->task;
 513
 514         if (!task) {
 515                 /*
 516                  * Per cpu counters are installed via an smp call and
 517                  * the install is always sucessful.
 518                  */
 519                 smp_call_function_single(cpu, __perf_install_in_context,
 520                                          counter, 1);
 521                 return;
 522         }
 523
 524         counter->task = task;
 525 retry:
 526         task_oncpu_function_call(task, __perf_install_in_context,
 527                                  counter);
 528
 529         spin_lock_irq(&ctx->lock);
 530         /*
 531          * we need to retry the smp call.
 532          */
 533         if (ctx->is_active && list_empty(&counter->list_entry)) {
 534                 spin_unlock_irq(&ctx->lock);
 535                 goto retry;
 536         }
 537
 538         /*
 539          * The lock prevents that this context is scheduled in so we
 540          * can add the counter safely, if it the call above did not
 541          * succeed.
 542          */
 543         if (list_empty(&counter->list_entry)) {
 544                 list_add_counter(counter, ctx);
 545                 ctx->nr_counters++;
 546         }
 547         spin_unlock_irq(&ctx->lock);
 548 }
 549
 550 /*
 551  * Cross CPU call to enable a performance counter
 552  */
 553 static void __perf_counter_enable(void *info)
 554 {
 555         struct perf_counter *counter = info;
 556         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 557         struct perf_counter_context *ctx = counter->ctx;
 558         struct perf_counter *leader = counter->group_leader;
 559         unsigned long flags;
 560         int err;
 561
 562         /*
 563          * If this is a per-task counter, need to check whether this
 564          * counter's task is the current task on this cpu.
 565          */
 566         if (ctx->task && cpuctx->task_ctx != ctx)
 567                 return;
 568
 569         curr_rq_lock_irq_save(&flags);
 570         spin_lock(&ctx->lock);
 571
 572         counter->prev_state = counter->state;
 573         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 574                 goto unlock;
 575         counter->state = PERF_COUNTER_STATE_INACTIVE;
 576
 577         /*
 578          * If the counter is in a group and isn't the group leader,
 579          * then don't put it on unless the group is on.
 580          */
 581         if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
 582                 goto unlock;
 583
 584         if (!group_can_go_on(counter, cpuctx, 1))
 585                 err = -EEXIST;
 586         else
 587                 err = counter_sched_in(counter, cpuctx, ctx,
 588                                        smp_processor_id());
 589
 590         if (err) {
 591                 /*
 592                  * If this counter can't go on and it's part of a
 593                  * group, then the whole group has to come off.
 594                  */
 595                 if (leader != counter)
 596                         group_sched_out(leader, cpuctx, ctx);
 597                 if (leader->hw_event.pinned)
 598                         leader->state = PERF_COUNTER_STATE_ERROR;
 599         }
 600
 601  unlock:
 602         spin_unlock(&ctx->lock);
 603         curr_rq_unlock_irq_restore(&flags);
 604 }
 605
 606 /*
 607  * Enable a counter.
 608  */
 609 static void perf_counter_enable(struct perf_counter *counter)
 610 {
 611         struct perf_counter_context *ctx = counter->ctx;
 612         struct task_struct *task = ctx->task;
 613
 614         if (!task) {
 615                 /*
 616                  * Enable the counter on the cpu that it's on
 617                  */
 618                 smp_call_function_single(counter->cpu, __perf_counter_enable,
 619                                          counter, 1);
 620                 return;
 621         }
 622
 623         spin_lock_irq(&ctx->lock);
 624         if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 625                 goto out;
 626
 627         /*
 628          * If the counter is in error state, clear that first.
 629          * That way, if we see the counter in error state below, we
 630          * know that it has gone back into error state, as distinct
 631          * from the task having been scheduled away before the
 632          * cross-call arrived.
 633          */
 634         if (counter->state == PERF_COUNTER_STATE_ERROR)
 635                 counter->state = PERF_COUNTER_STATE_OFF;
 636
 637  retry:
 638         spin_unlock_irq(&ctx->lock);
 639         task_oncpu_function_call(task, __perf_counter_enable, counter);
 640
 641         spin_lock_irq(&ctx->lock);
 642
 643         /*
 644          * If the context is active and the counter is still off,
 645          * we need to retry the cross-call.
 646          */
 647         if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
 648                 goto retry;
 649
 650         /*
 651          * Since we have the lock this context can't be scheduled
 652          * in, so we can change the state safely.
 653          */
 654         if (counter->state == PERF_COUNTER_STATE_OFF)
 655                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 656  out:
 657         spin_unlock_irq(&ctx->lock);
 658 }
 659
 660 /*
 661  * Enable a counter and all its children.
 662  */
 663 static void perf_counter_enable_family(struct perf_counter *counter)
 664 {
 665         struct perf_counter *child;
 666
 667         perf_counter_enable(counter);
 668
 669         /*
 670          * Lock the mutex to protect the list of children
 671          */
 672         mutex_lock(&counter->mutex);
 673         list_for_each_entry(child, &counter->child_list, child_list)
 674                 perf_counter_enable(child);
 675         mutex_unlock(&counter->mutex);
 676 }
 677
 678 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 679                               struct perf_cpu_context *cpuctx)
 680 {
 681         struct perf_counter *counter;
 682         u64 flags;
 683
 684         spin_lock(&ctx->lock);
 685         ctx->is_active = 0;
 686         if (likely(!ctx->nr_counters))
 687                 goto out;
 688
 689         flags = hw_perf_save_disable();
 690         if (ctx->nr_active) {
 691                 list_for_each_entry(counter, &ctx->counter_list, list_entry)
 692                         group_sched_out(counter, cpuctx, ctx);
 693         }
 694         hw_perf_restore(flags);
 695  out:
 696         spin_unlock(&ctx->lock);
 697 }
 698
 699 /*
 700  * Called from scheduler to remove the counters of the current task,
 701  * with interrupts disabled.
 702  *
 703  * We stop each counter and update the counter value in counter->count.
 704  *
 705  * This does not protect us against NMI, but disable()
 706  * sets the disabled bit in the control field of counter _before_
 707  * accessing the counter control register. If a NMI hits, then it will
 708  * not restart the counter.
 709  */
 710 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 711 {
 712         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 713         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 714         struct pt_regs *regs;
 715
 716         if (likely(!cpuctx->task_ctx))
 717                 return;
 718
 719         regs = task_pt_regs(task);
 720         perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
 721         __perf_counter_sched_out(ctx, cpuctx);
 722
 723         cpuctx->task_ctx = NULL;
 724 }
 725
 726 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 727 {
 728         __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 729 }
 730
 731 static int
 732 group_sched_in(struct perf_counter *group_counter,
 733                struct perf_cpu_context *cpuctx,
 734                struct perf_counter_context *ctx,
 735                int cpu)
 736 {
 737         struct perf_counter *counter, *partial_group;
 738         int ret;
 739
 740         if (group_counter->state == PERF_COUNTER_STATE_OFF)
 741                 return 0;
 742
 743         ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
 744         if (ret)
 745                 return ret < 0 ? ret : 0;
 746
 747         group_counter->prev_state = group_counter->state;
 748         if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 749                 return -EAGAIN;
 750
 751         /*
 752          * Schedule in siblings as one group (if any):
 753          */
 754         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 755                 counter->prev_state = counter->state;
 756                 if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 757                         partial_group = counter;
 758                         goto group_error;
 759                 }
 760         }
 761
 762         return 0;
 763
 764 group_error:
 765         /*
 766          * Groups can be scheduled in as one unit only, so undo any
 767          * partial group before returning:
 768          */
 769         list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 770                 if (counter == partial_group)
 771                         break;
 772                 counter_sched_out(counter, cpuctx, ctx);
 773         }
 774         counter_sched_out(group_counter, cpuctx, ctx);
 775
 776         return -EAGAIN;
 777 }
 778
 779 static void
 780 __perf_counter_sched_in(struct perf_counter_context *ctx,
 781                         struct perf_cpu_context *cpuctx, int cpu)
 782 {
 783         struct perf_counter *counter;
 784         u64 flags;
 785         int can_add_hw = 1;
 786
 787         spin_lock(&ctx->lock);
 788         ctx->is_active = 1;
 789         if (likely(!ctx->nr_counters))
 790                 goto out;
 791
 792         flags = hw_perf_save_disable();
 793
 794         /*
 795          * First go through the list and put on any pinned groups
 796          * in order to give them the best chance of going on.
 797          */
 798         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 799                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 800                     !counter->hw_event.pinned)
 801                         continue;
 802                 if (counter->cpu != -1 && counter->cpu != cpu)
 803                         continue;
 804
 805                 if (group_can_go_on(counter, cpuctx, 1))
 806                         group_sched_in(counter, cpuctx, ctx, cpu);
 807
 808                 /*
 809                  * If this pinned group hasn't been scheduled,
 810                  * put it in error state.
 811                  */
 812                 if (counter->state == PERF_COUNTER_STATE_INACTIVE)
 813                         counter->state = PERF_COUNTER_STATE_ERROR;
 814         }
 815
 816         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 817                 /*
 818                  * Ignore counters in OFF or ERROR state, and
 819                  * ignore pinned counters since we did them already.
 820                  */
 821                 if (counter->state <= PERF_COUNTER_STATE_OFF ||
 822                     counter->hw_event.pinned)
 823                         continue;
 824
 825                 /*
 826                  * Listen to the 'cpu' scheduling filter constraint
 827                  * of counters:
 828                  */
 829                 if (counter->cpu != -1 && counter->cpu != cpu)
 830                         continue;
 831
 832                 if (group_can_go_on(counter, cpuctx, can_add_hw)) {
 833                         if (group_sched_in(counter, cpuctx, ctx, cpu))
 834                                 can_add_hw = 0;
 835                 }
 836         }
 837         hw_perf_restore(flags);
 838  out:
 839         spin_unlock(&ctx->lock);
 840 }
 841
 842 /*
 843  * Called from scheduler to add the counters of the current task
 844  * with interrupts disabled.
 845  *
 846  * We restore the counter value and then enable it.
 847  *
 848  * This does not protect us against NMI, but enable()
 849  * sets the enabled bit in the control field of counter _before_
 850  * accessing the counter control register. If a NMI hits, then it will
 851  * keep the counter running.
 852  */
 853 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 854 {
 855         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 856         struct perf_counter_context *ctx = &task->perf_counter_ctx;
 857
 858         __perf_counter_sched_in(ctx, cpuctx, cpu);
 859         cpuctx->task_ctx = ctx;
 860 }
 861
 862 static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 863 {
 864         struct perf_counter_context *ctx = &cpuctx->ctx;
 865
 866         __perf_counter_sched_in(ctx, cpuctx, cpu);
 867 }
 868
 869 int perf_counter_task_disable(void)
 870 {
 871         struct task_struct *curr = current;
 872         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 873         struct perf_counter *counter;
 874         unsigned long flags;
 875         u64 perf_flags;
 876         int cpu;
 877
 878         if (likely(!ctx->nr_counters))
 879                 return 0;
 880
 881         curr_rq_lock_irq_save(&flags);
 882         cpu = smp_processor_id();
 883
 884         /* force the update of the task clock: */
 885         __task_delta_exec(curr, 1);
 886
 887         perf_counter_task_sched_out(curr, cpu);
 888
 889         spin_lock(&ctx->lock);
 890
 891         /*
 892          * Disable all the counters:
 893          */
 894         perf_flags = hw_perf_save_disable();
 895
 896         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 897                 if (counter->state != PERF_COUNTER_STATE_ERROR)
 898                         counter->state = PERF_COUNTER_STATE_OFF;
 899         }
 900
 901         hw_perf_restore(perf_flags);
 902
 903         spin_unlock(&ctx->lock);
 904
 905         curr_rq_unlock_irq_restore(&flags);
 906
 907         return 0;
 908 }
 909
 910 int perf_counter_task_enable(void)
 911 {
 912         struct task_struct *curr = current;
 913         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 914         struct perf_counter *counter;
 915         unsigned long flags;
 916         u64 perf_flags;
 917         int cpu;
 918
 919         if (likely(!ctx->nr_counters))
 920                 return 0;
 921
 922         curr_rq_lock_irq_save(&flags);
 923         cpu = smp_processor_id();
 924
 925         /* force the update of the task clock: */
 926         __task_delta_exec(curr, 1);
 927
 928         perf_counter_task_sched_out(curr, cpu);
 929
 930         spin_lock(&ctx->lock);
 931
 932         /*
 933          * Disable all the counters:
 934          */
 935         perf_flags = hw_perf_save_disable();
 936
 937         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 938                 if (counter->state > PERF_COUNTER_STATE_OFF)
 939                         continue;
 940                 counter->state = PERF_COUNTER_STATE_INACTIVE;
 941                 counter->hw_event.disabled = 0;
 942         }
 943         hw_perf_restore(perf_flags);
 944
 945         spin_unlock(&ctx->lock);
 946
 947         perf_counter_task_sched_in(curr, cpu);
 948
 949         curr_rq_unlock_irq_restore(&flags);
 950
 951         return 0;
 952 }
 953
 954 /*
 955  * Round-robin a context's counters:
 956  */
 957 static void rotate_ctx(struct perf_counter_context *ctx)
 958 {
 959         struct perf_counter *counter;
 960         u64 perf_flags;
 961
 962         if (!ctx->nr_counters)
 963                 return;
 964
 965         spin_lock(&ctx->lock);
 966         /*
 967          * Rotate the first entry last (works just fine for group counters too):
 968          */
 969         perf_flags = hw_perf_save_disable();
 970         list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 971                 list_move_tail(&counter->list_entry, &ctx->counter_list);
 972                 break;
 973         }
 974         hw_perf_restore(perf_flags);
 975
 976         spin_unlock(&ctx->lock);
 977 }
 978
 979 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 980 {
 981         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 982         struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 983         const int rotate_percpu = 0;
 984
 985         if (rotate_percpu)
 986                 perf_counter_cpu_sched_out(cpuctx);
 987         perf_counter_task_sched_out(curr, cpu);
 988
 989         if (rotate_percpu)
 990                 rotate_ctx(&cpuctx->ctx);
 991         rotate_ctx(ctx);
 992
 993         if (rotate_percpu)
 994                 perf_counter_cpu_sched_in(cpuctx, cpu);
 995         perf_counter_task_sched_in(curr, cpu);
 996 }
 997
 998 /*
 999  * Cross CPU call to read the hardware counter
1000  */
1001 static void __read(void *info)
1002 {
1003         struct perf_counter *counter = info;
1004         unsigned long flags;
1005
1006         curr_rq_lock_irq_save(&flags);
1007         counter->hw_ops->read(counter);
1008         curr_rq_unlock_irq_restore(&flags);
1009 }
1010
1011 static u64 perf_counter_read(struct perf_counter *counter)
1012 {
1013         /*
1014          * If counter is enabled and currently active on a CPU, update the
1015          * value in the counter structure:
1016          */
1017         if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
1018                 smp_call_function_single(counter->oncpu,
1019                                          __read, counter, 1);
1020         }
1021
1022         return atomic64_read(&counter->count);
1023 }
1024
1025 /*
1026  * Cross CPU call to switch performance data pointers
1027  */
1028 static void __perf_switch_irq_data(void *info)
1029 {
1030         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1031         struct perf_counter *counter = info;
1032         struct perf_counter_context *ctx = counter->ctx;
1033         struct perf_data *oldirqdata = counter->irqdata;
1034
1035         /*
1036          * If this is a task context, we need to check whether it is
1037          * the current task context of this cpu. If not it has been
1038          * scheduled out before the smp call arrived.
1039          */
1040         if (ctx->task) {
1041                 if (cpuctx->task_ctx != ctx)
1042                         return;
1043                 spin_lock(&ctx->lock);
1044         }
1045
1046         /* Change the pointer NMI safe */
1047         atomic_long_set((atomic_long_t *)&counter->irqdata,
1048                         (unsigned long) counter->usrdata);
1049         counter->usrdata = oldirqdata;
1050
1051         if (ctx->task)
1052                 spin_unlock(&ctx->lock);
1053 }
1054
1055 static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
1056 {
1057         struct perf_counter_context *ctx = counter->ctx;
1058         struct perf_data *oldirqdata = counter->irqdata;
1059         struct task_struct *task = ctx->task;
1060
1061         if (!task) {
1062                 smp_call_function_single(counter->cpu,
1063                                          __perf_switch_irq_data,
1064                                          counter, 1);
1065                 return counter->usrdata;
1066         }
1067
1068 retry:
1069         spin_lock_irq(&ctx->lock);
1070         if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
1071                 counter->irqdata = counter->usrdata;
1072                 counter->usrdata = oldirqdata;
1073                 spin_unlock_irq(&ctx->lock);
1074                 return oldirqdata;
1075         }
1076         spin_unlock_irq(&ctx->lock);
1077         task_oncpu_function_call(task, __perf_switch_irq_data, counter);
1078         /* Might have failed, because task was scheduled out */
1079         if (counter->irqdata == oldirqdata)
1080                 goto retry;
1081
1082         return counter->usrdata;
1083 }
1084
1085 static void put_context(struct perf_counter_context *ctx)
1086 {
1087         if (ctx->task)
1088                 put_task_struct(ctx->task);
1089 }
1090
1091 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
1092 {
1093         struct perf_cpu_context *cpuctx;
1094         struct perf_counter_context *ctx;
1095         struct task_struct *task;
1096
1097         /*
1098          * If cpu is not a wildcard then this is a percpu counter:
1099          */
1100         if (cpu != -1) {
1101                 /* Must be root to operate on a CPU counter: */
1102                 if (!capable(CAP_SYS_ADMIN))
1103                         return ERR_PTR(-EACCES);
1104
1105                 if (cpu < 0 || cpu > num_possible_cpus())
1106                         return ERR_PTR(-EINVAL);
1107
1108                 /*
1109                  * We could be clever and allow to attach a counter to an
1110                  * offline CPU and activate it when the CPU comes up, but
1111                  * that's for later.
1112                  */
1113                 if (!cpu_isset(cpu, cpu_online_map))
1114                         return ERR_PTR(-ENODEV);
1115
1116                 cpuctx = &per_cpu(perf_cpu_context, cpu);
1117                 ctx = &cpuctx->ctx;
1118
1119                 return ctx;
1120         }
1121
1122         rcu_read_lock();
1123         if (!pid)
1124                 task = current;
1125         else
1126                 task = find_task_by_vpid(pid);
1127         if (task)
1128                 get_task_struct(task);
1129         rcu_read_unlock();
1130
1131         if (!task)
1132                 return ERR_PTR(-ESRCH);
1133
1134         ctx = &task->perf_counter_ctx;
1135         ctx->task = task;
1136
1137         /* Reuse ptrace permission checks for now. */
1138         if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
1139                 put_context(ctx);
1140                 return ERR_PTR(-EACCES);
1141         }
1142
1143         return ctx;
1144 }
1145
1146 static void free_counter_rcu(struct rcu_head *head)
1147 {
1148         struct perf_counter *counter;
1149
1150         counter = container_of(head, struct perf_counter, rcu_head);
1151         kfree(counter);
1152 }
1153
1154 static void free_counter(struct perf_counter *counter)
1155 {
1156         if (counter->destroy)
1157                 counter->destroy(counter);
1158
1159         call_rcu(&counter->rcu_head, free_counter_rcu);
1160 }
1161
1162 /*
1163  * Called when the last reference to the file is gone.
1164  */
1165 static int perf_release(struct inode *inode, struct file *file)
1166 {
1167         struct perf_counter *counter = file->private_data;
1168         struct perf_counter_context *ctx = counter->ctx;
1169
1170         file->private_data = NULL;
1171
1172         mutex_lock(&ctx->mutex);
1173         mutex_lock(&counter->mutex);
1174
1175         perf_counter_remove_from_context(counter);
1176
1177         mutex_unlock(&counter->mutex);
1178         mutex_unlock(&ctx->mutex);
1179
1180         free_page(counter->user_page);
1181         free_counter(counter);
1182         put_context(ctx);
1183
1184         return 0;
1185 }
1186
1187 /*
1188  * Read the performance counter - simple non blocking version for now
1189  */
1190 static ssize_t
1191 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
1192 {
1193         u64 cntval;
1194
1195         if (count != sizeof(cntval))
1196                 return -EINVAL;
1197
1198         /*
1199          * Return end-of-file for a read on a counter that is in
1200          * error state (i.e. because it was pinned but it couldn't be
1201          * scheduled on to the CPU at some point).
1202          */
1203         if (counter->state == PERF_COUNTER_STATE_ERROR)
1204                 return 0;
1205
1206         mutex_lock(&counter->mutex);
1207         cntval = perf_counter_read(counter);
1208         mutex_unlock(&counter->mutex);
1209
1210         return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
1211 }
1212
1213 static ssize_t
1214 perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
1215 {
1216         if (!usrdata->len)
1217                 return 0;
1218
1219         count = min(count, (size_t)usrdata->len);
1220         if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
1221                 return -EFAULT;
1222
1223         /* Adjust the counters */
1224         usrdata->len -= count;
1225         if (!usrdata->len)
1226                 usrdata->rd_idx = 0;
1227         else
1228                 usrdata->rd_idx += count;
1229
1230         return count;
1231 }
1232
1233 static ssize_t
1234 perf_read_irq_data(struct perf_counter  *counter,
1235                    char __user          *buf,
1236                    size_t               count,
1237                    int                  nonblocking)
1238 {
1239         struct perf_data *irqdata, *usrdata;
1240         DECLARE_WAITQUEUE(wait, current);
1241         ssize_t res, res2;
1242
1243         irqdata = counter->irqdata;
1244         usrdata = counter->usrdata;
1245
1246         if (usrdata->len + irqdata->len >= count)
1247                 goto read_pending;
1248
1249         if (nonblocking)
1250                 return -EAGAIN;
1251
1252         spin_lock_irq(&counter->waitq.lock);
1253         __add_wait_queue(&counter->waitq, &wait);
1254         for (;;) {
1255                 set_current_state(TASK_INTERRUPTIBLE);
1256                 if (usrdata->len + irqdata->len >= count)
1257                         break;
1258
1259                 if (signal_pending(current))
1260                         break;
1261
1262                 if (counter->state == PERF_COUNTER_STATE_ERROR)
1263                         break;
1264
1265                 spin_unlock_irq(&counter->waitq.lock);
1266                 schedule();
1267                 spin_lock_irq(&counter->waitq.lock);
1268         }
1269         __remove_wait_queue(&counter->waitq, &wait);
1270         __set_current_state(TASK_RUNNING);
1271         spin_unlock_irq(&counter->waitq.lock);
1272
1273         if (usrdata->len + irqdata->len < count &&
1274             counter->state != PERF_COUNTER_STATE_ERROR)
1275                 return -ERESTARTSYS;
1276 read_pending:
1277         mutex_lock(&counter->mutex);
1278
1279         /* Drain pending data first: */
1280         res = perf_copy_usrdata(usrdata, buf, count);
1281         if (res < 0 || res == count)
1282                 goto out;
1283
1284         /* Switch irq buffer: */
1285         usrdata = perf_switch_irq_data(counter);
1286         res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
1287         if (res2 < 0) {
1288                 if (!res)
1289                         res = -EFAULT;
1290         } else {
1291                 res += res2;
1292         }
1293 out:
1294         mutex_unlock(&counter->mutex);
1295
1296         return res;
1297 }
1298
1299 static ssize_t
1300 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
1301 {
1302         struct perf_counter *counter = file->private_data;
1303
1304         switch (counter->hw_event.record_type) {
1305         case PERF_RECORD_SIMPLE:
1306                 return perf_read_hw(counter, buf, count);
1307
1308         case PERF_RECORD_IRQ:
1309         case PERF_RECORD_GROUP:
1310                 return perf_read_irq_data(counter, buf, count,
1311                                           file->f_flags & O_NONBLOCK);
1312         }
1313         return -EINVAL;
1314 }
1315
1316 static unsigned int perf_poll(struct file *file, poll_table *wait)
1317 {
1318         struct perf_counter *counter = file->private_data;
1319         unsigned int events = 0;
1320         unsigned long flags;
1321
1322         poll_wait(file, &counter->waitq, wait);
1323
1324         spin_lock_irqsave(&counter->waitq.lock, flags);
1325         if (counter->usrdata->len || counter->irqdata->len)
1326                 events |= POLLIN;
1327         spin_unlock_irqrestore(&counter->waitq.lock, flags);
1328
1329         return events;
1330 }
1331
1332 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
1333 {
1334         struct perf_counter *counter = file->private_data;
1335         int err = 0;
1336
1337         switch (cmd) {
1338         case PERF_COUNTER_IOC_ENABLE:
1339                 perf_counter_enable_family(counter);
1340                 break;
1341         case PERF_COUNTER_IOC_DISABLE:
1342                 perf_counter_disable_family(counter);
1343                 break;
1344         default:
1345                 err = -ENOTTY;
1346         }
1347         return err;
1348 }
1349
1350 void perf_counter_update_userpage(struct perf_counter *counter)
1351 {
1352         struct perf_counter_mmap_page *userpg;
1353
1354         if (!counter->user_page)
1355                 return;
1356         userpg = (struct perf_counter_mmap_page *) counter->user_page;
1357
1358         ++userpg->lock;
1359         smp_wmb();
1360         userpg->index = counter->hw.idx;
1361         userpg->offset = atomic64_read(&counter->count);
1362         if (counter->state == PERF_COUNTER_STATE_ACTIVE)
1363                 userpg->offset -= atomic64_read(&counter->hw.prev_count);
1364         smp_wmb();
1365         ++userpg->lock;
1366 }
1367
1368 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1369 {
1370         struct perf_counter *counter = vma->vm_file->private_data;
1371
1372         if (!counter->user_page)
1373                 return VM_FAULT_SIGBUS;
1374
1375         vmf->page = virt_to_page(counter->user_page);
1376         get_page(vmf->page);
1377         return 0;
1378 }
1379
1380 static struct vm_operations_struct perf_mmap_vmops = {
1381         .fault = perf_mmap_fault,
1382 };
1383
1384 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
1385 {
1386         struct perf_counter *counter = file->private_data;
1387         unsigned long userpg;
1388
1389         if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
1390                 return -EINVAL;
1391         if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1392                 return -EINVAL;
1393
1394         /*
1395          * For now, restrict to the case of a hardware counter
1396          * on the current task.
1397          */
1398         if (is_software_counter(counter) || counter->task != current)
1399                 return -EINVAL;
1400
1401         userpg = counter->user_page;
1402         if (!userpg) {
1403                 userpg = get_zeroed_page(GFP_KERNEL);
1404                 mutex_lock(&counter->mutex);
1405                 if (counter->user_page) {
1406                         free_page(userpg);
1407                         userpg = counter->user_page;
1408                 } else {
1409                         counter->user_page = userpg;
1410                 }
1411                 mutex_unlock(&counter->mutex);
1412                 if (!userpg)
1413                         return -ENOMEM;
1414         }
1415
1416         perf_counter_update_userpage(counter);
1417
1418         vma->vm_flags &= ~VM_MAYWRITE;
1419         vma->vm_flags |= VM_RESERVED;
1420         vma->vm_ops = &perf_mmap_vmops;
1421         return 0;
1422 }
1423
1424 static const struct file_operations perf_fops = {
1425         .release                = perf_release,
1426         .read                   = perf_read,
1427         .poll                   = perf_poll,
1428         .unlocked_ioctl         = perf_ioctl,
1429         .compat_ioctl           = perf_ioctl,
1430         .mmap                   = perf_mmap,
1431 };
1432
1433 /*
1434  * Output
1435  */
1436
1437 static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
1438 {
1439         struct perf_data *irqdata = counter->irqdata;
1440
1441         if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
1442                 irqdata->overrun++;
1443         } else {
1444                 u64 *p = (u64 *) &irqdata->data[irqdata->len];
1445
1446                 *p = data;
1447                 irqdata->len += sizeof(u64);
1448         }
1449 }
1450
1451 static void perf_counter_handle_group(struct perf_counter *counter)
1452 {
1453         struct perf_counter *leader, *sub;
1454
1455         leader = counter->group_leader;
1456         list_for_each_entry(sub, &leader->sibling_list, list_entry) {
1457                 if (sub != counter)
1458                         sub->hw_ops->read(sub);
1459                 perf_counter_store_irq(counter, sub->hw_event.config);
1460                 perf_counter_store_irq(counter, atomic64_read(&sub->count));
1461         }
1462 }
1463
1464 void perf_counter_output(struct perf_counter *counter,
1465                          int nmi, struct pt_regs *regs)
1466 {
1467         switch (counter->hw_event.record_type) {
1468         case PERF_RECORD_SIMPLE:
1469                 return;
1470
1471         case PERF_RECORD_IRQ:
1472                 perf_counter_store_irq(counter, instruction_pointer(regs));
1473                 break;
1474
1475         case PERF_RECORD_GROUP:
1476                 perf_counter_handle_group(counter);
1477                 break;
1478         }
1479
1480         if (nmi) {
1481                 counter->wakeup_pending = 1;
1482                 set_perf_counter_pending();
1483         } else
1484                 wake_up(&counter->waitq);
1485 }
1486
1487 /*
1488  * Generic software counter infrastructure
1489  */
1490
1491 static void perf_swcounter_update(struct perf_counter *counter)
1492 {
1493         struct hw_perf_counter *hwc = &counter->hw;
1494         u64 prev, now;
1495         s64 delta;
1496
1497 again:
1498         prev = atomic64_read(&hwc->prev_count);
1499         now = atomic64_read(&hwc->count);
1500         if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
1501                 goto again;
1502
1503         delta = now - prev;
1504
1505         atomic64_add(delta, &counter->count);
1506         atomic64_sub(delta, &hwc->period_left);
1507 }
1508
1509 static void perf_swcounter_set_period(struct perf_counter *counter)
1510 {
1511         struct hw_perf_counter *hwc = &counter->hw;
1512         s64 left = atomic64_read(&hwc->period_left);
1513         s64 period = hwc->irq_period;
1514
1515         if (unlikely(left <= -period)) {
1516                 left = period;
1517                 atomic64_set(&hwc->period_left, left);
1518         }
1519
1520         if (unlikely(left <= 0)) {
1521                 left += period;
1522                 atomic64_add(period, &hwc->period_left);
1523         }
1524
1525         atomic64_set(&hwc->prev_count, -left);
1526         atomic64_set(&hwc->count, -left);
1527 }
1528
1529 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
1530 {
1531         struct perf_counter *counter;
1532         struct pt_regs *regs;
1533
1534         counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
1535         counter->hw_ops->read(counter);
1536
1537         regs = get_irq_regs();
1538         /*
1539          * In case we exclude kernel IPs or are somehow not in interrupt
1540          * context, provide the next best thing, the user IP.
1541          */
1542         if ((counter->hw_event.exclude_kernel || !regs) &&
1543                         !counter->hw_event.exclude_user)
1544                 regs = task_pt_regs(current);
1545
1546         if (regs)
1547                 perf_counter_output(counter, 0, regs);
1548
1549         hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
1550
1551         return HRTIMER_RESTART;
1552 }
1553
/*
 * Overflow handling of a software counter, in three steps: fold the
 * pending events into the counter value, arm the next period, emit a
 * sample.
 */
static void perf_swcounter_overflow(struct perf_counter *counter,
                                    int nmi, struct pt_regs *regs)
{
        /* 1) account */
        perf_swcounter_update(counter);
        /* 2) re-arm */
        perf_swcounter_set_period(counter);
        /* 3) report */
        perf_counter_output(counter, nmi, regs);
}
1561
1562 static int perf_swcounter_match(struct perf_counter *counter,
1563                                 enum perf_event_types type,
1564                                 u32 event, struct pt_regs *regs)
1565 {
1566         if (counter->state != PERF_COUNTER_STATE_ACTIVE)
1567                 return 0;
1568
1569         if (perf_event_raw(&counter->hw_event))
1570                 return 0;
1571
1572         if (perf_event_type(&counter->hw_event) != type)
1573                 return 0;
1574
1575         if (perf_event_id(&counter->hw_event) != event)
1576                 return 0;
1577
1578         if (counter->hw_event.exclude_user && user_mode(regs))
1579                 return 0;
1580
1581         if (counter->hw_event.exclude_kernel && !user_mode(regs))
1582                 return 0;
1583
1584         return 1;
1585 }
1586
1587 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
1588                                int nmi, struct pt_regs *regs)
1589 {
1590         int neg = atomic64_add_negative(nr, &counter->hw.count);
1591         if (counter->hw.irq_period && !neg)
1592                 perf_swcounter_overflow(counter, nmi, regs);
1593 }
1594
1595 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
1596                                      enum perf_event_types type, u32 event,
1597                                      u64 nr, int nmi, struct pt_regs *regs)
1598 {
1599         struct perf_counter *counter;
1600
1601         if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
1602                 return;
1603
1604         rcu_read_lock();
1605         list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
1606                 if (perf_swcounter_match(counter, type, event, regs))
1607                         perf_swcounter_add(counter, nr, nmi, regs);
1608         }
1609         rcu_read_unlock();
1610 }
1611
1612 static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
1613 {
1614         if (in_nmi())
1615                 return &cpuctx->recursion[3];
1616
1617         if (in_irq())
1618                 return &cpuctx->recursion[2];
1619
1620         if (in_softirq())
1621                 return &cpuctx->recursion[1];
1622
1623         return &cpuctx->recursion[0];
1624 }
1625
1626 static void __perf_swcounter_event(enum perf_event_types type, u32 event,
1627                                    u64 nr, int nmi, struct pt_regs *regs)
1628 {
1629         struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
1630         int *recursion = perf_swcounter_recursion_context(cpuctx);
1631
1632         if (*recursion)
1633                 goto out;
1634
1635         (*recursion)++;
1636         barrier();
1637
1638         perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
1639         if (cpuctx->task_ctx) {
1640                 perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
1641                                 nr, nmi, regs);
1642         }
1643
1644         barrier();
1645         (*recursion)--;
1646
1647 out:
1648         put_cpu_var(perf_cpu_context);
1649 }
1650
1651 void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
1652 {
1653         __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
1654 }
1655
/*
 * read() method of the generic software counter: bring the counter
 * value up to date.
 */
static void perf_swcounter_read(struct perf_counter *counter)
{
        /* Reading is nothing more than an update. */
        perf_swcounter_update(counter);
}
1660
/*
 * enable() method of the generic software counter: arm the period.
 * Always returns 0.
 */
static int perf_swcounter_enable(struct perf_counter *counter)
{
        /* Arming the period is all that is needed. */
        perf_swcounter_set_period(counter);

        return 0;
}
1666
/*
 * disable() method of the generic software counter: fold the events
 * that are still pending into the counter value.
 */
static void perf_swcounter_disable(struct perf_counter *counter)
{
        /* Pick up whatever was counted since the last update. */
        perf_swcounter_update(counter);
}
1671
1672 static const struct hw_perf_counter_ops perf_ops_generic = {
1673         .enable         = perf_swcounter_enable,
1674         .disable        = perf_swcounter_disable,
1675         .read           = perf_swcounter_read,
1676 };
1677
1678 /*
1679  * Software counter: cpu wall time clock
1680  */
1681
1682 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
1683 {
1684         int cpu = raw_smp_processor_id();
1685         s64 prev;
1686         u64 now;
1687
1688         now = cpu_clock(cpu);
1689         prev = atomic64_read(&counter->hw.prev_count);
1690         atomic64_set(&counter->hw.prev_count, now);
1691         atomic64_add(now - prev, &counter->count);
1692 }
1693
1694 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
1695 {
1696         struct hw_perf_counter *hwc = &counter->hw;
1697         int cpu = raw_smp_processor_id();
1698
1699         atomic64_set(&hwc->prev_count, cpu_clock(cpu));
1700         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1701         hwc->hrtimer.function = perf_swcounter_hrtimer;
1702         if (hwc->irq_period) {
1703                 __hrtimer_start_range_ns(&hwc->hrtimer,
1704                                 ns_to_ktime(hwc->irq_period), 0,
1705                                 HRTIMER_MODE_REL, 0);
1706         }
1707
1708         return 0;
1709 }
1710
1711 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
1712 {
1713         hrtimer_cancel(&counter->hw.hrtimer);
1714         cpu_clock_perf_counter_update(counter);
1715 }
1716
/*
 * read() method of the cpu clock counter: account the time up to now.
 */
static void cpu_clock_perf_counter_read(struct perf_counter *counter)
{
        /* Reading is nothing more than an update. */
        cpu_clock_perf_counter_update(counter);
}
1721
1722 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
1723         .enable         = cpu_clock_perf_counter_enable,
1724         .disable        = cpu_clock_perf_counter_disable,
1725         .read           = cpu_clock_perf_counter_read,
1726 };
1727
1728 /*
1729  * Software counter: task time clock
1730  */
1731
1732 /*
1733  * Called from within the scheduler:
1734  */
1735 static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
1736 {
1737         struct task_struct *curr = counter->task;
1738         u64 delta;
1739
1740         delta = __task_delta_exec(curr, update);
1741
1742         return curr->se.sum_exec_runtime + delta;
1743 }
1744
1745 static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
1746 {
1747         u64 prev;
1748         s64 delta;
1749
1750         prev = atomic64_read(&counter->hw.prev_count);
1751
1752         atomic64_set(&counter->hw.prev_count, now);
1753
1754         delta = now - prev;
1755
1756         atomic64_add(delta, &counter->count);
1757 }
1758
1759 static int task_clock_perf_counter_enable(struct perf_counter *counter)
1760 {
1761         struct hw_perf_counter *hwc = &counter->hw;
1762
1763         atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
1764         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1765         hwc->hrtimer.function = perf_swcounter_hrtimer;
1766         if (hwc->irq_period) {
1767                 __hrtimer_start_range_ns(&hwc->hrtimer,
1768                                 ns_to_ktime(hwc->irq_period), 0,
1769                                 HRTIMER_MODE_REL, 0);
1770         }
1771
1772         return 0;
1773 }
1774
1775 static void task_clock_perf_counter_disable(struct perf_counter *counter)
1776 {
1777         hrtimer_cancel(&counter->hw.hrtimer);
1778         task_clock_perf_counter_update(counter,
1779                         task_clock_perf_counter_val(counter, 0));
1780 }
1781
1782 static void task_clock_perf_counter_read(struct perf_counter *counter)
1783 {
1784         task_clock_perf_counter_update(counter,
1785                         task_clock_perf_counter_val(counter, 1));
1786 }
1787
1788 static const struct hw_perf_counter_ops perf_ops_task_clock = {
1789         .enable         = task_clock_perf_counter_enable,
1790         .disable        = task_clock_perf_counter_disable,
1791         .read           = task_clock_perf_counter_read,
1792 };
1793
1794 /*
1795  * Software counter: cpu migrations
1796  */
1797
1798 static inline u64 get_cpu_migrations(struct perf_counter *counter)
1799 {
1800         struct task_struct *curr = counter->ctx->task;
1801
1802         if (curr)
1803                 return curr->se.nr_migrations;
1804         return cpu_nr_migrations(smp_processor_id());
1805 }
1806
1807 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
1808 {
1809         u64 prev, now;
1810         s64 delta;
1811
1812         prev = atomic64_read(&counter->hw.prev_count);
1813         now = get_cpu_migrations(counter);
1814
1815         atomic64_set(&counter->hw.prev_count, now);
1816
1817         delta = now - prev;
1818
1819         atomic64_add(delta, &counter->count);
1820 }
1821
/*
 * read() method of the cpu migrations counter: bring the counter value
 * up to date.
 */
static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
{
        /* Reading is nothing more than an update. */
        cpu_migrations_perf_counter_update(counter);
}
1826
1827 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
1828 {
1829         if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
1830                 atomic64_set(&counter->hw.prev_count,
1831                              get_cpu_migrations(counter));
1832         return 0;
1833 }
1834
/*
 * disable() method of the cpu migrations counter: account the
 * migrations up to now.
 */
static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
{
        /* Pick up whatever happened since the last update. */
        cpu_migrations_perf_counter_update(counter);
}
1839
1840 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
1841         .enable         = cpu_migrations_perf_counter_enable,
1842         .disable        = cpu_migrations_perf_counter_disable,
1843         .read           = cpu_migrations_perf_counter_read,
1844 };
1845
1846 #ifdef CONFIG_EVENT_PROFILE
1847 void perf_tpcounter_event(int event_id)
1848 {
1849         struct pt_regs *regs = get_irq_regs();
1850
1851         if (!regs)
1852                 regs = task_pt_regs(current);
1853
1854         __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
1855 }
1856
1857 extern int ftrace_profile_enable(int);
1858 extern void ftrace_profile_disable(int);
1859
1860 static void tp_perf_counter_destroy(struct perf_counter *counter)
1861 {
1862         ftrace_profile_disable(perf_event_id(&counter->hw_event));
1863 }
1864
1865 static const struct hw_perf_counter_ops *
1866 tp_perf_counter_init(struct perf_counter *counter)
1867 {
1868         int event_id = perf_event_id(&counter->hw_event);
1869         int ret;
1870
1871         ret = ftrace_profile_enable(event_id);
1872         if (ret)
1873                 return NULL;
1874
1875         counter->destroy = tp_perf_counter_destroy;
1876         counter->hw.irq_period = counter->hw_event.irq_period;
1877
1878         return &perf_ops_generic;
1879 }
1880 #else
1881 static const struct hw_perf_counter_ops *
1882 tp_perf_counter_init(struct perf_counter *counter)
1883 {
1884         return NULL;
1885 }
1886 #endif
1887
1888 static const struct hw_perf_counter_ops *
1889 sw_perf_counter_init(struct perf_counter *counter)
1890 {
1891         struct perf_counter_hw_event *hw_event = &counter->hw_event;
1892         const struct hw_perf_counter_ops *hw_ops = NULL;
1893         struct hw_perf_counter *hwc = &counter->hw;
1894
1895         /*
1896          * Software counters (currently) can't in general distinguish
1897          * between user, kernel and hypervisor events.
1898          * However, context switches and cpu migrations are considered
1899          * to be kernel events, and page faults are never hypervisor
1900          * events.
1901          */
1902         switch (perf_event_id(&counter->hw_event)) {
1903         case PERF_COUNT_CPU_CLOCK:
1904                 hw_ops = &perf_ops_cpu_clock;
1905
1906                 if (hw_event->irq_period && hw_event->irq_period < 10000)
1907                         hw_event->irq_period = 10000;
1908                 break;
1909         case PERF_COUNT_TASK_CLOCK:
1910                 /*
1911                  * If the user instantiates this as a per-cpu counter,
1912                  * use the cpu_clock counter instead.
1913                  */
1914                 if (counter->ctx->task)
1915                         hw_ops = &perf_ops_task_clock;
1916                 else
1917                         hw_ops = &perf_ops_cpu_clock;
1918
1919                 if (hw_event->irq_period && hw_event->irq_period < 10000)
1920                         hw_event->irq_period = 10000;
1921                 break;
1922         case PERF_COUNT_PAGE_FAULTS:
1923         case PERF_COUNT_PAGE_FAULTS_MIN:
1924         case PERF_COUNT_PAGE_FAULTS_MAJ:
1925         case PERF_COUNT_CONTEXT_SWITCHES:
1926                 hw_ops = &perf_ops_generic;
1927                 break;
1928         case PERF_COUNT_CPU_MIGRATIONS:
1929                 if (!counter->hw_event.exclude_kernel)
1930                         hw_ops = &perf_ops_cpu_migrations;
1931                 break;
1932         }
1933
1934         if (hw_ops)
1935                 hwc->irq_period = hw_event->irq_period;
1936
1937         return hw_ops;
1938 }
1939
1940 /*
1941  * Allocate and initialize a counter structure
1942  */
1943 static struct perf_counter *
1944 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
1945                    int cpu,
1946                    struct perf_counter_context *ctx,
1947                    struct perf_counter *group_leader,
1948                    gfp_t gfpflags)
1949 {
1950         const struct hw_perf_counter_ops *hw_ops;
1951         struct perf_counter *counter;
1952
1953         counter = kzalloc(sizeof(*counter), gfpflags);
1954         if (!counter)
1955                 return NULL;
1956
1957         /*
1958          * Single counters are their own group leaders, with an
1959          * empty sibling list:
1960          */
1961         if (!group_leader)
1962                 group_leader = counter;
1963
1964         mutex_init(&counter->mutex);
1965         INIT_LIST_HEAD(&counter->list_entry);
1966         INIT_LIST_HEAD(&counter->event_entry);
1967         INIT_LIST_HEAD(&counter->sibling_list);
1968         init_waitqueue_head(&counter->waitq);
1969
1970         INIT_LIST_HEAD(&counter->child_list);
1971
1972         counter->irqdata                = &counter->data[0];
1973         counter->usrdata                = &counter->data[1];
1974         counter->cpu                    = cpu;
1975         counter->hw_event               = *hw_event;
1976         counter->wakeup_pending         = 0;
1977         counter->group_leader           = group_leader;
1978         counter->hw_ops                 = NULL;
1979         counter->ctx                    = ctx;
1980
1981         counter->state = PERF_COUNTER_STATE_INACTIVE;
1982         if (hw_event->disabled)
1983                 counter->state = PERF_COUNTER_STATE_OFF;
1984
1985         hw_ops = NULL;
1986
1987         if (perf_event_raw(hw_event)) {
1988                 hw_ops = hw_perf_counter_init(counter);
1989                 goto done;
1990         }
1991
1992         switch (perf_event_type(hw_event)) {
1993         case PERF_TYPE_HARDWARE:
1994                 hw_ops = hw_perf_counter_init(counter);
1995                 break;
1996
1997         case PERF_TYPE_SOFTWARE:
1998                 hw_ops = sw_perf_counter_init(counter);
1999                 break;
2000
2001         case PERF_TYPE_TRACEPOINT:
2002                 hw_ops = tp_perf_counter_init(counter);
2003                 break;
2004         }
2005
2006         if (!hw_ops) {
2007                 kfree(counter);
2008                 return NULL;
2009         }
2010 done:
2011         counter->hw_ops = hw_ops;
2012
2013         return counter;
2014 }
2015
2016 /**
2017  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
2018  *
2019  * @hw_event_uptr:      event type attributes for monitoring/sampling
2020  * @pid:                target pid
2021  * @cpu:                target cpu
2022  * @group_fd:           group leader counter fd
2023  */
2024 SYSCALL_DEFINE5(perf_counter_open,
2025                 const struct perf_counter_hw_event __user *, hw_event_uptr,
2026                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
2027 {
2028         struct perf_counter *counter, *group_leader;
2029         struct perf_counter_hw_event hw_event;
2030         struct perf_counter_context *ctx;
2031         struct file *counter_file = NULL;
2032         struct file *group_file = NULL;
2033         int fput_needed = 0;
2034         int fput_needed2 = 0;
2035         int ret;
2036
2037         /* for future expandability... */
2038         if (flags)
2039                 return -EINVAL;
2040
2041         if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
2042                 return -EFAULT;
2043
2044         /*
2045          * Get the target context (task or percpu):
2046          */
2047         ctx = find_get_context(pid, cpu);
2048         if (IS_ERR(ctx))
2049                 return PTR_ERR(ctx);
2050
2051         /*
2052          * Look up the group leader (we will attach this counter to it):
2053          */
2054         group_leader = NULL;
2055         if (group_fd != -1) {
2056                 ret = -EINVAL;
2057                 group_file = fget_light(group_fd, &fput_needed);
2058                 if (!group_file)
2059                         goto err_put_context;
2060                 if (group_file->f_op != &perf_fops)
2061                         goto err_put_context;
2062
2063                 group_leader = group_file->private_data;
2064                 /*
2065                  * Do not allow a recursive hierarchy (this new sibling
2066                  * becoming part of another group-sibling):
2067                  */
2068                 if (group_leader->group_leader != group_leader)
2069                         goto err_put_context;
2070                 /*
2071                  * Do not allow to attach to a group in a different
2072                  * task or CPU context:
2073                  */
2074                 if (group_leader->ctx != ctx)
2075                         goto err_put_context;
2076                 /*
2077                  * Only a group leader can be exclusive or pinned
2078                  */
2079                 if (hw_event.exclusive || hw_event.pinned)
2080                         goto err_put_context;
2081         }
2082
2083         ret = -EINVAL;
2084         counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
2085                                      GFP_KERNEL);
2086         if (!counter)
2087                 goto err_put_context;
2088
2089         ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
2090         if (ret < 0)
2091                 goto err_free_put_context;
2092
2093         counter_file = fget_light(ret, &fput_needed2);
2094         if (!counter_file)
2095                 goto err_free_put_context;
2096
2097         counter->filp = counter_file;
2098         mutex_lock(&ctx->mutex);
2099         perf_install_in_context(ctx, counter, cpu);
2100         mutex_unlock(&ctx->mutex);
2101
2102         fput_light(counter_file, fput_needed2);
2103
2104 out_fput:
2105         fput_light(group_file, fput_needed);
2106
2107         return ret;
2108
2109 err_free_put_context:
2110         kfree(counter);
2111
2112 err_put_context:
2113         put_context(ctx);
2114
2115         goto out_fput;
2116 }
2117
2118 /*
2119  * Initialize the perf_counter context in a task_struct:
2120  */
2121 static void
2122 __perf_counter_init_context(struct perf_counter_context *ctx,
2123                             struct task_struct *task)
2124 {
2125         memset(ctx, 0, sizeof(*ctx));
2126         spin_lock_init(&ctx->lock);
2127         mutex_init(&ctx->mutex);
2128         INIT_LIST_HEAD(&ctx->counter_list);
2129         INIT_LIST_HEAD(&ctx->event_list);
2130         ctx->task = task;
2131 }
2132
2133 /*
2134  * inherit a counter from parent task to child task:
2135  */
2136 static struct perf_counter *
2137 inherit_counter(struct perf_counter *parent_counter,
2138               struct task_struct *parent,
2139               struct perf_counter_context *parent_ctx,
2140               struct task_struct *child,
2141               struct perf_counter *group_leader,
2142               struct perf_counter_context *child_ctx)
2143 {
2144         struct perf_counter *child_counter;
2145
2146         /*
2147          * Instead of creating recursive hierarchies of counters,
2148          * we link inherited counters back to the original parent,
2149          * which has a filp for sure, which we use as the reference
2150          * count:
2151          */
2152         if (parent_counter->parent)
2153                 parent_counter = parent_counter->parent;
2154
2155         child_counter = perf_counter_alloc(&parent_counter->hw_event,
2156                                            parent_counter->cpu, child_ctx,
2157                                            group_leader, GFP_KERNEL);
2158         if (!child_counter)
2159                 return NULL;
2160
2161         /*
2162          * Link it up in the child's context:
2163          */
2164         child_counter->task = child;
2165         list_add_counter(child_counter, child_ctx);
2166         child_ctx->nr_counters++;
2167
2168         child_counter->parent = parent_counter;
2169         /*
2170          * inherit into child's child as well:
2171          */
2172         child_counter->hw_event.inherit = 1;
2173
2174         /*
2175          * Get a reference to the parent filp - we will fput it
2176          * when the child counter exits. This is safe to do because
2177          * we are in the parent and we know that the filp still
2178          * exists and has a nonzero count:
2179          */
2180         atomic_long_inc(&parent_counter->filp->f_count);
2181
2182         /*
2183          * Link this into the parent counter's child list
2184          */
2185         mutex_lock(&parent_counter->mutex);
2186         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
2187
2188         /*
2189          * Make the child state follow the state of the parent counter,
2190          * not its hw_event.disabled bit.  We hold the parent's mutex,
2191          * so we won't race with perf_counter_{en,dis}able_family.
2192          */
2193         if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
2194                 child_counter->state = PERF_COUNTER_STATE_INACTIVE;
2195         else
2196                 child_counter->state = PERF_COUNTER_STATE_OFF;
2197
2198         mutex_unlock(&parent_counter->mutex);
2199
2200         return child_counter;
2201 }
2202
2203 static int inherit_group(struct perf_counter *parent_counter,
2204               struct task_struct *parent,
2205               struct perf_counter_context *parent_ctx,
2206               struct task_struct *child,
2207               struct perf_counter_context *child_ctx)
2208 {
2209         struct perf_counter *leader;
2210         struct perf_counter *sub;
2211
2212         leader = inherit_counter(parent_counter, parent, parent_ctx,
2213                                  child, NULL, child_ctx);
2214         if (!leader)
2215                 return -ENOMEM;
2216         list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
2217                 if (!inherit_counter(sub, parent, parent_ctx,
2218                                      child, leader, child_ctx))
2219                         return -ENOMEM;
2220         }
2221         return 0;
2222 }
2223
2224 static void sync_child_counter(struct perf_counter *child_counter,
2225                                struct perf_counter *parent_counter)
2226 {
2227         u64 parent_val, child_val;
2228
2229         parent_val = atomic64_read(&parent_counter->count);
2230         child_val = atomic64_read(&child_counter->count);
2231
2232         /*
2233          * Add back the child's count to the parent's count:
2234          */
2235         atomic64_add(child_val, &parent_counter->count);
2236
2237         /*
2238          * Remove this counter from the parent's list
2239          */
2240         mutex_lock(&parent_counter->mutex);
2241         list_del_init(&child_counter->child_list);
2242         mutex_unlock(&parent_counter->mutex);
2243
2244         /*
2245          * Release the parent counter, if this was the last
2246          * reference to it.
2247          */
2248         fput(parent_counter->filp);
2249 }
2250
2251 static void
2252 __perf_counter_exit_task(struct task_struct *child,
2253                          struct perf_counter *child_counter,
2254                          struct perf_counter_context *child_ctx)
2255 {
2256         struct perf_counter *parent_counter;
2257         struct perf_counter *sub, *tmp;
2258
2259         /*
2260          * If we do not self-reap then we have to wait for the
2261          * child task to unschedule (it will happen for sure),
2262          * so that its counter is at its final count. (This
2263          * condition triggers rarely - child tasks usually get
2264          * off their CPU before the parent has a chance to
2265          * get this far into the reaping action)
2266          */
2267         if (child != current) {
2268                 wait_task_inactive(child, 0);
2269                 list_del_init(&child_counter->list_entry);
2270         } else {
2271                 struct perf_cpu_context *cpuctx;
2272                 unsigned long flags;
2273                 u64 perf_flags;
2274
2275                 /*
2276                  * Disable and unlink this counter.
2277                  *
2278                  * Be careful about zapping the list - IRQ/NMI context
2279                  * could still be processing it:
2280                  */
2281                 curr_rq_lock_irq_save(&flags);
2282                 perf_flags = hw_perf_save_disable();
2283
2284                 cpuctx = &__get_cpu_var(perf_cpu_context);
2285
2286                 group_sched_out(child_counter, cpuctx, child_ctx);
2287
2288                 list_del_init(&child_counter->list_entry);
2289
2290                 child_ctx->nr_counters--;
2291
2292                 hw_perf_restore(perf_flags);
2293                 curr_rq_unlock_irq_restore(&flags);
2294         }
2295
2296         parent_counter = child_counter->parent;
2297         /*
2298          * It can happen that parent exits first, and has counters
2299          * that are still around due to the child reference. These
2300          * counters need to be zapped - but otherwise linger.
2301          */
2302         if (parent_counter) {
2303                 sync_child_counter(child_counter, parent_counter);
2304                 list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
2305                                          list_entry) {
2306                         if (sub->parent) {
2307                                 sync_child_counter(sub, sub->parent);
2308                                 free_counter(sub);
2309                         }
2310                 }
2311                 free_counter(child_counter);
2312         }
2313 }
2314
2315 /*
2316  * When a child task exits, feed back counter values to parent counters.
2317  *
2318  * Note: we may be running in child context, but the PID is not hashed
2319  * anymore so new counters will not be added.
2320  */
2321 void perf_counter_exit_task(struct task_struct *child)
2322 {
2323         struct perf_counter *child_counter, *tmp;
2324         struct perf_counter_context *child_ctx;
2325
2326         child_ctx = &child->perf_counter_ctx;
2327
2328         if (likely(!child_ctx->nr_counters))
2329                 return;
2330
2331         list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
2332                                  list_entry)
2333                 __perf_counter_exit_task(child, child_counter, child_ctx);
2334 }
2335
2336 /*
2337  * Initialize the perf_counter context in task_struct
2338  */
2339 void perf_counter_init_task(struct task_struct *child)
2340 {
2341         struct perf_counter_context *child_ctx, *parent_ctx;
2342         struct perf_counter *counter;
2343         struct task_struct *parent = current;
2344
2345         child_ctx  =  &child->perf_counter_ctx;
2346         parent_ctx = &parent->perf_counter_ctx;
2347
2348         __perf_counter_init_context(child_ctx, child);
2349
2350         /*
2351          * This is executed from the parent task context, so inherit
2352          * counters that have been marked for cloning:
2353          */
2354
2355         if (likely(!parent_ctx->nr_counters))
2356                 return;
2357
2358         /*
2359          * Lock the parent list. No need to lock the child - not PID
2360          * hashed yet and not running, so nobody can access it.
2361          */
2362         mutex_lock(&parent_ctx->mutex);
2363
2364         /*
2365          * We dont have to disable NMIs - we are only looking at
2366          * the list, not manipulating it:
2367          */
2368         list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
2369                 if (!counter->hw_event.inherit)
2370                         continue;
2371
2372                 if (inherit_group(counter, parent,
2373                                   parent_ctx, child, child_ctx))
2374                         break;
2375         }
2376
2377         mutex_unlock(&parent_ctx->mutex);
2378 }
2379
2380 static void __cpuinit perf_counter_init_cpu(int cpu)
2381 {
2382         struct perf_cpu_context *cpuctx;
2383
2384         cpuctx = &per_cpu(perf_cpu_context, cpu);
2385         __perf_counter_init_context(&cpuctx->ctx, NULL);
2386
2387         mutex_lock(&perf_resource_mutex);
2388         cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
2389         mutex_unlock(&perf_resource_mutex);
2390
2391         hw_perf_counter_setup(cpu);
2392 }
2393
2394 #ifdef CONFIG_HOTPLUG_CPU
2395 static void __perf_counter_exit_cpu(void *info)
2396 {
2397         struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
2398         struct perf_counter_context *ctx = &cpuctx->ctx;
2399         struct perf_counter *counter, *tmp;
2400
2401         list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
2402                 __perf_counter_remove_from_context(counter);
2403 }
2404 static void perf_counter_exit_cpu(int cpu)
2405 {
2406         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
2407         struct perf_counter_context *ctx = &cpuctx->ctx;
2408
2409         mutex_lock(&ctx->mutex);
2410         smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
2411         mutex_unlock(&ctx->mutex);
2412 }
2413 #else
/* Without CONFIG_HOTPLUG_CPU there is nothing to tear down. */
static inline void perf_counter_exit_cpu(int cpu)
{
}
2415 #endif
2416
2417 static int __cpuinit
2418 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
2419 {
2420         unsigned int cpu = (long)hcpu;
2421
2422         switch (action) {
2423
2424         case CPU_UP_PREPARE:
2425         case CPU_UP_PREPARE_FROZEN:
2426                 perf_counter_init_cpu(cpu);
2427                 break;
2428
2429         case CPU_DOWN_PREPARE:
2430         case CPU_DOWN_PREPARE_FROZEN:
2431                 perf_counter_exit_cpu(cpu);
2432                 break;
2433
2434         default:
2435                 break;
2436         }
2437
2438         return NOTIFY_OK;
2439 }
2440
2441 static struct notifier_block __cpuinitdata perf_cpu_nb = {
2442         .notifier_call          = perf_cpu_notify,
2443 };
2444
2445 static int __init perf_counter_init(void)
2446 {
2447         perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
2448                         (void *)(long)smp_processor_id());
2449         register_cpu_notifier(&perf_cpu_nb);
2450
2451         return 0;
2452 }
2453 early_initcall(perf_counter_init);
2454
2455 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
2456 {
2457         return sprintf(buf, "%d\n", perf_reserved_percpu);
2458 }
2459
2460 static ssize_t
2461 perf_set_reserve_percpu(struct sysdev_class *class,
2462                         const char *buf,
2463                         size_t count)
2464 {
2465         struct perf_cpu_context *cpuctx;
2466         unsigned long val;
2467         int err, cpu, mpt;
2468
2469         err = strict_strtoul(buf, 10, &val);
2470         if (err)
2471                 return err;
2472         if (val > perf_max_counters)
2473                 return -EINVAL;
2474
2475         mutex_lock(&perf_resource_mutex);
2476         perf_reserved_percpu = val;
2477         for_each_online_cpu(cpu) {
2478                 cpuctx = &per_cpu(perf_cpu_context, cpu);
2479                 spin_lock_irq(&cpuctx->ctx.lock);
2480                 mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
2481                           perf_max_counters - perf_reserved_percpu);
2482                 cpuctx->max_pertask = mpt;
2483                 spin_unlock_irq(&cpuctx->ctx.lock);
2484         }
2485         mutex_unlock(&perf_resource_mutex);
2486
2487         return count;
2488 }
2489
2490 static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
2491 {
2492         return sprintf(buf, "%d\n", perf_overcommit);
2493 }
2494
2495 static ssize_t
2496 perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
2497 {
2498         unsigned long val;
2499         int err;
2500
2501         err = strict_strtoul(buf, 10, &val);
2502         if (err)
2503                 return err;
2504         if (val > 1)
2505                 return -EINVAL;
2506
2507         mutex_lock(&perf_resource_mutex);
2508         perf_overcommit = val;
2509         mutex_unlock(&perf_resource_mutex);
2510
2511         return count;
2512 }
2513
2514 static SYSDEV_CLASS_ATTR(
2515                                 reserve_percpu,
2516                                 0644,
2517                                 perf_show_reserve_percpu,
2518                                 perf_set_reserve_percpu
2519                         );
2520
2521 static SYSDEV_CLASS_ATTR(
2522                                 overcommit,
2523                                 0644,
2524                                 perf_show_overcommit,
2525                                 perf_set_overcommit
2526                         );
2527
2528 static struct attribute *perfclass_attrs[] = {
2529         &attr_reserve_percpu.attr,
2530         &attr_overcommit.attr,
2531         NULL
2532 };
2533
2534 static struct attribute_group perfclass_attr_group = {
2535         .attrs                  = perfclass_attrs,
2536         .name                   = "perf_counters",
2537 };
2538
2539 static int __init perf_counter_sysfs_init(void)
2540 {
2541         return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
2542                                   &perfclass_attr_group);
2543 }
2544 device_initcall(perf_counter_sysfs_init);