/*
* Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
*/
static bool __match_event(struct perf_event *a, struct perf_event *b)
{
+ /* Per-cpu and task events don't mix */
if ((a->attach_state & PERF_ATTACH_TASK) !=
(b->attach_state & PERF_ATTACH_TASK))
return false;
- /* not task */
+#ifdef CONFIG_CGROUP_PERF
+ if (a->cgrp != b->cgrp)
+ return false;
+#endif
+
+ /* If not task event, we're machine wide */
+ if (!(b->attach_state & PERF_ATTACH_TASK))
+ return true;
+
+ /*
+ * Events that target same task are placed into the same cache group.
+ */
+ if (a->hw.cqm_target == b->hw.cqm_target)
+ return true;
+
+ /*
+ * Are we an inherited event?
+ */
+ if (b->parent == a)
+ return true;
+
+ return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+ if (event->attach_state & PERF_ATTACH_TASK)
+ return perf_cgroup_from_task(event->hw.cqm_target);
- return true; /* if not task, we're machine wide */
+ return event->cgrp;
}
+#endif
/*
* Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ * PROHIBITS
+ * system-wide -> cgroup and task
+ * cgroup -> system-wide
+ * -> task in cgroup
+ * task -> system-wide
+ * -> task in cgroup
+ *
+ * Call this function before allocating an RMID.
*/
static bool __conflict_event(struct perf_event *a, struct perf_event *b)
{
+#ifdef CONFIG_CGROUP_PERF
+ /*
+ * We can have any number of cgroups but only one system-wide
+ * event at a time.
+ */
+ if (a->cgrp && b->cgrp) {
+ struct perf_cgroup *ac = a->cgrp;
+ struct perf_cgroup *bc = b->cgrp;
+
+ /*
+ * This condition should have been caught in
+ * __match_event() and we should be sharing an RMID.
+ */
+ WARN_ON_ONCE(ac == bc);
+
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+
+ if (a->cgrp || b->cgrp) {
+ struct perf_cgroup *ac, *bc;
+
+ /*
+ * cgroup and system-wide events are mutually exclusive
+ */
+ if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+ (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+ return true;
+
+ /*
+ * Ensure neither event is part of the other's cgroup
+ */
+ ac = event_to_cgroup(a);
+ bc = event_to_cgroup(b);
+ if (ac == bc)
+ return true;
+
+ /*
+ * Must have cgroup and non-intersecting task events.
+ */
+ if (!ac || !bc)
+ return false;
+
+ /*
+ * We have cgroup and task events, and the task belongs
+ * to a cgroup. Check for for overlap.
+ */
+ if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+ cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+ return true;
+
+ return false;
+ }
+#endif
/*
* If one of them is not a task, same story as above with cgroups.
*/
static void intel_cqm_event_read(struct perf_event *event)
{
- unsigned long rmid = event->hw.cqm_rmid;
+ unsigned long rmid;
u64 val;
+ /*
+ * Task events are handled by intel_cqm_event_count().
+ */
+ if (event->cpu == -1)
+ return;
+
+ rmid = event->hw.cqm_rmid;
val = __rmid_read(rmid);
/*
local64_set(&event->count, val);
}
+struct rmid_read {
+ unsigned int rmid;
+ atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info)
+{
+ struct rmid_read *rr = info;
+ u64 val;
+
+ val = __rmid_read(rr->rmid);
+
+ if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+ return;
+
+ atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+ return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+ struct rmid_read rr = {
+ .rmid = event->hw.cqm_rmid,
+ .value = ATOMIC64_INIT(0),
+ };
+
+ /*
+ * We only need to worry about task events. System-wide events
+ * are handled like usual, i.e. entirely with
+ * intel_cqm_event_read().
+ */
+ if (event->cpu != -1)
+ return __perf_event_count(event);
+
+ /*
+ * Only the group leader gets to report values. This stops us
+ * reporting duplicate values to userspace, and gives us a clear
+ * rule for which task gets to report the values.
+ *
+ * Note that it is impossible to attribute these values to
+ * specific packages - we forfeit that ability when we create
+ * task events.
+ */
+ if (!cqm_group_leader(event))
+ return 0;
+
+ on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+ local64_set(&event->count, atomic64_read(&rr.value));
+
+ return __perf_event_count(event);
+}
+
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
/*
* And we're the group leader..
*/
- if (!list_empty(&event->hw.cqm_groups_entry)) {
+ if (cqm_group_leader(event)) {
/*
* If there was a group_other, make that leader, otherwise
* destroy the group and return the RMID.
static struct pmu intel_cqm_pmu;
-/*
- * XXX there's a bit of a problem in that we cannot simply do the one
- * event per node as one would want, since that one event would one get
- * scheduled on the one cpu. But we want to 'schedule' the RMID on all
- * CPUs.
- *
- * This means we want events for each CPU, however, that generates a lot
- * of duplicate values out to userspace -- this is not to be helped
- * unless we want to change the core code in some way. Fore more info,
- * see intel_cqm_event_read().
- */
static int intel_cqm_event_init(struct perf_event *event)
{
struct perf_event *group = NULL;
if (event->attr.config & ~QOS_EVENT_MASK)
return -EINVAL;
- if (event->cpu == -1)
- return -EINVAL;
-
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
mutex_lock(&cache_mutex);
- err = intel_cqm_setup_event(event, &group); /* will also set rmid */
+ /* Will also set rmid */
+ err = intel_cqm_setup_event(event, &group);
if (err)
goto out;
.start = intel_cqm_event_start,
.stop = intel_cqm_event_stop,
.read = intel_cqm_event_read,
+ .count = intel_cqm_event_count,
};
static inline void cqm_pick_event_reader(int cpu)
__perf_cpu_notifier(intel_cqm_cpu_notifier);
- ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-
+ ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
+ PERF_TYPE_INTEL_CQM);
if (ret)
pr_err("Intel CQM perf registration failed: %d\n", ret);
else