drm/i915/guc: reset GuC and retry on firmware load failure
author Arun Siluvery <arun.siluvery@linux.intel.com>
Mon, 4 Apr 2016 17:50:56 +0000 (18:50 +0100)
committer Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Tue, 5 Apr 2016 12:29:24 +0000 (13:29 +0100)
Due to timing issues in the HW, some of the status bits required for GuC
authentication occasionally don't get set; when that happens, the GuC
cannot be initialized and we will be left with a wedged GPU. The W/A
suggested is to perform a soft reset of the GuC and attempt to reload
the F/W a few times before giving up.

As the failure is timing-dependent, tests performed by triggering a
manual full GPU reset (i915_wedged) showed that we could sometimes hit
this after several thousand iterations, but sometimes the tests ran even
longer without any issues. The reset-and-reload mechanism proved helpful
when we did hit a F/W load failure, so it is better to include it
to improve driver stability.

This change implements the following workarounds (WAs):

WaEnableuKernelHeaderValidFix:skl,bxt
WaEnableGuCBootHashCheckNotSet:skl,bxt

Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
Reviewed-by: Alex Dai <yu.dai@intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_guc_reg.h
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/intel_guc_loader.c
drivers/gpu/drm/i915/intel_uncore.c

index 6443745d4182782701100f0218e85696d4a1dc4b..466b8b68f467c8fd31ea1b602be0915ab52145a9 100644 (file)
@@ -2747,6 +2747,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 extern int intel_gpu_reset(struct drm_device *dev, u32 engine_mask);
 extern bool intel_has_gpu_reset(struct drm_device *dev);
 extern int i915_reset(struct drm_device *dev);
+extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
 extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
index e4ba5822289bb6dcbc0a225c02e87d62fe48175a..94ceee5f05447e335bb80f36bc3d783879edfcf2 100644 (file)
@@ -27,6 +27,7 @@
 /* Definitions of GuC H/W registers, bits, etc */
 
 #define GUC_STATUS                     _MMIO(0xc000)
+#define   GS_MIA_IN_RESET              (1 << 0)
 #define   GS_BOOTROM_SHIFT             1
 #define   GS_BOOTROM_MASK                (0x7F << GS_BOOTROM_SHIFT)
 #define   GS_BOOTROM_RSA_FAILED                  (0x50 << GS_BOOTROM_SHIFT)
index 683274f274051038359e41c4a2a1cadbdeb2e7ee..30fea341ae66eeddd09b296ff1ab0151ca779fb7 100644 (file)
@@ -165,6 +165,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN6_GRDOM_MEDIA              (1 << 2)
 #define  GEN6_GRDOM_BLT                        (1 << 3)
 #define  GEN6_GRDOM_VECS               (1 << 4)
+#define  GEN9_GRDOM_GUC                        (1 << 5)
 #define  GEN8_GRDOM_MEDIA2             (1 << 7)
 
 #define RING_PP_DIR_BASE(ring)         _MMIO((ring)->mmio_base+0x228)
index b4976f985369156043f87a3a0408ec2ef0e1d457..d84c5608f06837bfcc68fc15a5a8538ea8421b1e 100644 (file)
@@ -353,6 +353,24 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv)
        return ret;
 }
 
+static int i915_reset_guc(struct drm_i915_private *dev_priv)
+{
+       int ret;
+       u32 guc_status;
+
+       ret = intel_guc_reset(dev_priv);
+       if (ret) {
+               DRM_ERROR("GuC reset failed, ret = %d\n", ret);
+               return ret;
+       }
+
+       guc_status = I915_READ(GUC_STATUS);
+       WARN(!(guc_status & GS_MIA_IN_RESET),
+            "GuC status: 0x%x, MIA core expected to be in reset\n", guc_status);
+
+       return ret;
+}
+
 /**
  * intel_guc_ucode_load() - load GuC uCode into the device
  * @dev:       drm device
@@ -417,9 +435,36 @@ int intel_guc_ucode_load(struct drm_device *dev)
        if (err)
                goto fail;
 
+       /*
+        * WaEnableuKernelHeaderValidFix:skl,bxt
+        * For BXT, this is only upto B0 but below WA is required for later
+        * steppings also so this is extended as well.
+        */
+       /* WaEnableGuCBootHashCheckNotSet:skl,bxt */
        err = guc_ucode_xfer(dev_priv);
-       if (err)
-               goto fail;
+       if (err) {
+               int retries = 3;
+
+               DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err);
+
+               while (retries--) {
+                       err = i915_reset_guc(dev_priv);
+                       if (err)
+                               break;
+
+                       err = guc_ucode_xfer(dev_priv);
+                       if (!err) {
+                               DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n");
+                               break;
+                       }
+                       DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries);
+               }
+
+               if (err) {
+                       DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err);
+                       goto fail;
+               }
+       }
 
        guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS;
 
index ac1c545436afd4d43948698f692e6087291b8c99..fbc1d215ca67672abf7df50f69be58cf2f227178 100644 (file)
@@ -1673,6 +1673,25 @@ bool intel_has_gpu_reset(struct drm_device *dev)
        return intel_get_gpu_reset(dev) != NULL;
 }
 
+int intel_guc_reset(struct drm_i915_private *dev_priv)
+{
+       int ret;
+       unsigned long irqflags;
+
+       if (!i915.enable_guc_submission)
+               return -EINVAL;
+
+       intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+       spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
+
+       ret = gen6_hw_domain_reset(dev_priv, GEN9_GRDOM_GUC);
+
+       spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
+       intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+
+       return ret;
+}
+
 bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
 {
        return check_for_unclaimed_mmio(dev_priv);