drm/i915: Add bannable context parameter
authorMika Kuoppala <mika.kuoppala@linux.intel.com>
Wed, 16 Nov 2016 15:20:32 +0000 (17:20 +0200)
committerMika Kuoppala <mika.kuoppala@intel.com>
Mon, 21 Nov 2016 12:36:40 +0000 (14:36 +0200)
Now when driver has per context scoring of 'hanging badness'
and also subsequent hangs during short windows are allowed,
if there is progress made in between, it does not make sense
to expose a ban timing window as a context parameter anymore.

Let the scoring be the sole indicator for ban policy and substitute
ban period context parameter as a boolean to get/set context
bannable property.

v2: allow non root to opt into being banned (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_gem_context.c
drivers/gpu/drm/i915/i915_gpu_error.c
include/uapi/drm/i915_drm.h

index 691f0b694e779eaa59a92fa0647d57e000638459..7db7b37937e7d483c13e8fff0d3af9418d7f8cef 100644 (file)
@@ -850,6 +850,7 @@ struct drm_i915_error_state {
                        long jiffies;
                        pid_t pid;
                        u32 context;
+                       int ban_score;
                        u32 seqno;
                        u32 head;
                        u32 tail;
@@ -909,16 +910,10 @@ struct i915_ctx_hang_stats {
        /* This context had batch active when hang was declared */
        unsigned batch_active;
 
-       /* Time when this context was last blamed for a GPU reset */
-       unsigned long guilty_ts;
-
-       /* If the contexts causes a second GPU hang within this time,
-        * it is permanently banned from submitting any more work.
-        */
-       unsigned long ban_period_seconds;
+       bool bannable:1;
 
        /* This context is banned to submit more work */
-       bool banned;
+       bool banned:1;
 
 #define CONTEXT_SCORE_GUILTY           10
 #define CONTEXT_SCORE_BAN_THRESHOLD    40
@@ -1459,8 +1454,6 @@ struct i915_gpu_error {
        /* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
 #define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
-       /* Hang gpu twice in this window and your context gets banned */
-#define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
 
        struct delayed_work hangcheck_work;
 
index 4c4aed2d2afb86faef721d1431d22fcd567fb393..99198146ecc86f067476d9c57abf7c7285bb7d88 100644 (file)
@@ -2623,20 +2623,13 @@ err_unlock:
 static bool i915_context_is_banned(const struct i915_gem_context *ctx)
 {
        const struct i915_ctx_hang_stats *hs = &ctx->hang_stats;
-       unsigned long elapsed;
 
        if (hs->banned)
                return true;
 
-       if (!hs->ban_period_seconds)
+       if (!hs->bannable)
                return false;
 
-       elapsed = get_seconds() - hs->guilty_ts;
-       if (elapsed <= hs->ban_period_seconds) {
-               DRM_DEBUG("context hanging too fast, banning!\n");
-               return true;
-       }
-
        if (hs->ban_score >= CONTEXT_SCORE_BAN_THRESHOLD) {
                DRM_DEBUG("context hanging too often, banning!\n");
                return true;
@@ -2653,7 +2646,6 @@ static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 
        hs->banned = i915_context_is_banned(ctx);
        hs->batch_active++;
-       hs->guilty_ts = get_seconds();
 }
 
 static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
index d95dfec7166ee74b525198116f6304f31bee8041..97012373a8d57080b879539127d5e8d74ed22517 100644 (file)
@@ -331,7 +331,7 @@ __create_hw_context(struct drm_device *dev,
         * is no remap info, it will be a NOP. */
        ctx->remap_slice = ALL_L3_SLICES(dev_priv);
 
-       ctx->hang_stats.ban_period_seconds = DRM_I915_CTX_BAN_PERIOD;
+       ctx->hang_stats.bannable = true;
        ctx->ring_size = 4 * PAGE_SIZE;
        ctx->desc_template = GEN8_CTX_ADDRESSING_MODE(dev_priv) <<
                             GEN8_CTX_ADDRESSING_MODE_SHIFT;
@@ -1085,7 +1085,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
        args->size = 0;
        switch (args->param) {
        case I915_CONTEXT_PARAM_BAN_PERIOD:
-               args->value = ctx->hang_stats.ban_period_seconds;
+               ret = -EINVAL;
                break;
        case I915_CONTEXT_PARAM_NO_ZEROMAP:
                args->value = ctx->flags & CONTEXT_NO_ZEROMAP;
@@ -1101,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
        case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
                args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
                break;
+       case I915_CONTEXT_PARAM_BANNABLE:
+               args->value = ctx->hang_stats.bannable;
+               break;
        default:
                ret = -EINVAL;
                break;
@@ -1130,13 +1133,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 
        switch (args->param) {
        case I915_CONTEXT_PARAM_BAN_PERIOD:
-               if (args->size)
-                       ret = -EINVAL;
-               else if (args->value < ctx->hang_stats.ban_period_seconds &&
-                        !capable(CAP_SYS_ADMIN))
-                       ret = -EPERM;
-               else
-                       ctx->hang_stats.ban_period_seconds = args->value;
+               ret = -EINVAL;
                break;
        case I915_CONTEXT_PARAM_NO_ZEROMAP:
                if (args->size) {
@@ -1156,6 +1153,14 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
                                ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
                }
                break;
+       case I915_CONTEXT_PARAM_BANNABLE:
+               if (args->size)
+                       ret = -EINVAL;
+               else if (!capable(CAP_SYS_ADMIN) && !args->value)
+                       ret = -EPERM;
+               else
+                       ctx->hang_stats.bannable = args->value;
+               break;
        default:
                ret = -EINVAL;
                break;
index d5a4ec9b507ff4956cb3b1dd76cd251b8fe6d885..fa988a00ae68b11e53054f4853a81fa7abf39bb0 100644 (file)
@@ -352,8 +352,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
        if (!erq->seqno)
                return;
 
-       err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
-                  prefix, erq->pid,
+       err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
+                  prefix, erq->pid, erq->ban_score,
                   erq->context, erq->seqno,
                   jiffies_to_msecs(jiffies - erq->jiffies),
                   erq->head, erq->tail);
@@ -1170,6 +1170,7 @@ static void record_request(struct drm_i915_gem_request *request,
                           struct drm_i915_error_request *erq)
 {
        erq->context = request->ctx->hw_id;
+       erq->ban_score = request->ctx->hang_stats.ban_score;
        erq->seqno = request->global_seqno;
        erq->jiffies = request->emitted_jiffies;
        erq->head = request->head;
index 1c12a350eca391a1442acc40f6a75728ea9b11f9..12003f0d8c7f3dd9d19e5bc72a6f85696f0e9197 100644 (file)
@@ -1224,6 +1224,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_NO_ZEROMAP  0x2
 #define I915_CONTEXT_PARAM_GTT_SIZE    0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE    0x4
+#define I915_CONTEXT_PARAM_BANNABLE    0x5
        __u64 value;
 };