drm/i915: Restore context and pd for ringbuffer submission after reset

author Chris Wilson <chris@chris-wilson.co.uk>

Tue, 7 Feb 2017 15:24:37 +0000 (15:24 +0000)

committer Jani Nikula <jani.nikula@intel.com>

Thu, 16 Feb 2017 09:59:11 +0000 (11:59 +0200)
author Chris Wilson <chris@chris-wilson.co.uk>
Tue, 7 Feb 2017 15:24:37 +0000 (15:24 +0000)
committer Jani Nikula <jani.nikula@intel.com>
Thu, 16 Feb 2017 09:59:11 +0000 (11:59 +0200)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c

index c8689892a89fc8218ca43ec1be7ffc9724bee5e0..c7eba361c14dce87ba7575fecad913ecc12a6efa 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2735,21 +2735,17 @@ static void i915_gem_reset_engine(struct intel_engine_cs *engine)
                 engine->irq_seqno_barrier(engine);
  
         request = i915_gem_find_active_request(engine);
-       if (!request)
-               return;
-
-       if (!i915_gem_reset_request(request))
-               return;
+       if (request && i915_gem_reset_request(request)) {
+               DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
+                                engine->name, request->global_seqno);
  
-       DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
-                        engine->name, request->global_seqno);
+               /* If this context is now banned, skip all pending requests. */
+               if (i915_gem_context_is_banned(request->ctx))
+                       engine_skip_context(request);
+       }
  
         /* Setup the CS to resume from the breadcrumb of the hung request */
         engine->reset_hw(engine, request);
-
-       /* If this context is now banned, skip all of its pending requests. */
-       if (i915_gem_context_is_banned(request->ctx))
-               engine_skip_context(request);
  }
  
  void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h

index 72f9f36ae5ce57c22edc1f31064433d635dd9a8a..675323189f2c50077cac2ff0fa79b2c7c14f4a10 100644 (file)
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -3307,8 +3307,10 @@ enum skl_disp_power_wells {
  /*
   * Logical Context regs
   */
-#define CCID                   _MMIO(0x2180)
-#define   CCID_EN              (1<<0)
+#define CCID                           _MMIO(0x2180)
+#define   CCID_EN                      BIT(0)
+#define   CCID_EXTENDED_STATE_RESTORE  BIT(2)
+#define   CCID_EXTENDED_STATE_SAVE     BIT(3)
  /*
   * Notes on SNB/IVB/VLV context size:
   * - Power context is saved elsewhere (LLC or stolen)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c

index 2e767ebff08447b9275f109d4fa6e560dbae977b..ebf8023d21e6fba52c01b54d46fb3456b4709731 100644 (file)
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1390,7 +1390,20 @@ static void reset_common_ring(struct intel_engine_cs *engine,
  {
         struct drm_i915_private *dev_priv = engine->i915;
         struct execlist_port *port = engine->execlist_port;
-       struct intel_context *ce = &request->ctx->engine[engine->id];
+       struct intel_context *ce;
+
+       /* If the request was innocent, we leave the request in the ELSP
+        * and will try to replay it on restarting. The context image may
+        * have been corrupted by the reset, in which case we may have
+        * to service a new GPU hang, but more likely we can continue on
+        * without impact.
+        *
+        * If the request was guilty, we presume the context is corrupt
+        * and have to at least restore the RING register in the context
+        * image back to the expected values to skip over the guilty request.
+        */
+       if (!request || request->fence.error != -EIO)
+               return;
  
         /* We want a simple context + ring to execute the breadcrumb update.
          * We cannot rely on the context being intact across the GPU hang,
@@ -1399,6 +1412,7 @@ static void reset_common_ring(struct intel_engine_cs *engine,
          * future request will be after userspace has had the opportunity
          * to recreate its own state.
          */
+       ce = &request->ctx->engine[engine->id];
         execlists_init_reg_state(ce->lrc_reg_state,
                                  request->ctx, engine, ce->ring);
  
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c

index 69035e4f9b3b76a06c4705c9429cbe44c2093ba3..91bc4abf5d3e578ae9dffe2dd2adfd9c4305e31e 100644 (file)
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -599,10 +599,62 @@ out:
  static void reset_ring_common(struct intel_engine_cs *engine,
                               struct drm_i915_gem_request *request)
  {
-       struct intel_ring *ring = request->ring;
+       /* Try to restore the logical GPU state to match the continuation
+        * of the request queue. If we skip the context/PD restore, then
+        * the next request may try to execute assuming that its context
+        * is valid and loaded on the GPU and so may try to access invalid
+        * memory, prompting repeated GPU hangs.
+        *
+        * If the request was guilty, we still restore the logical state
+        * in case the next request requires it (e.g. the aliasing ppgtt),
+        * but skip over the hung batch.
+        *
+        * If the request was innocent, we try to replay the request with
+        * the restored context.
+        */
+       if (request) {
+               struct drm_i915_private *dev_priv = request->i915;
+               struct intel_context *ce = &request->ctx->engine[engine->id];
+               struct i915_hw_ppgtt *ppgtt;
+
+               /* FIXME consider gen8 reset */
+
+               if (ce->state) {
+                       I915_WRITE(CCID,
+                                  i915_ggtt_offset(ce->state) |
+                                  BIT(8) /* must be set! */ |
+                                  CCID_EXTENDED_STATE_SAVE |
+                                  CCID_EXTENDED_STATE_RESTORE |
+                                  CCID_EN);
+               }
  
-       ring->head = request->postfix;
-       ring->last_retired_head = -1;
+               ppgtt = request->ctx->ppgtt ?: engine->i915->mm.aliasing_ppgtt;
+               if (ppgtt) {
+                       u32 pd_offset = ppgtt->pd.base.ggtt_offset << 10;
+
+                       I915_WRITE(RING_PP_DIR_DCLV(engine), PP_DIR_DCLV_2G);
+                       I915_WRITE(RING_PP_DIR_BASE(engine), pd_offset);
+
+                       /* Wait for the PD reload to complete */
+                       if (intel_wait_for_register(dev_priv,
+                                                   RING_PP_DIR_BASE(engine),
+                                                   BIT(0), 0,
+                                                   10))
+                               DRM_ERROR("Wait for reload of ppgtt page-directory timed out\n");
+
+                       ppgtt->pd_dirty_rings &= ~intel_engine_flag(engine);
+               }
+
+               /* If the rq hung, jump to its breadcrumb and skip the batch */
+               if (request->fence.error == -EIO) {
+                       struct intel_ring *ring = request->ring;
+
+                       ring->head = request->postfix;
+                       ring->last_retired_head = -1;
+               }
+       } else {
+               engine->legacy_active_context = NULL;
+       }
  }
  
  static int intel_ring_workarounds_emit(struct drm_i915_gem_request *req)
author	Chris Wilson <chris@chris-wilson.co.uk>
	Tue, 7 Feb 2017 15:24:37 +0000 (15:24 +0000)
committer	Jani Nikula <jani.nikula@intel.com>
	Thu, 16 Feb 2017 09:59:11 +0000 (11:59 +0200)
drivers/gpu/drm/i915/i915_gem.c		patch | blob | blame | history
drivers/gpu/drm/i915/i915_reg.h		patch | blob | blame | history
drivers/gpu/drm/i915/intel_lrc.c		patch | blob | blame | history
drivers/gpu/drm/i915/intel_ringbuffer.c		patch | blob | blame | history