drm/i915/bdw: Apply workarounds in render ring init function
author: Arun Siluvery <arun.siluvery@linux.intel.com>
Tue, 26 Aug 2014 13:44:50 +0000 (14:44 +0100)
committer: Daniel Vetter <daniel.vetter@ffwll.ch>
Wed, 3 Sep 2014 09:04:42 +0000 (11:04 +0200)
For BDW, workarounds are currently initialized in init_clock_gating(), but
they are lost during reset, suspend/resume, etc. This patch moves the WAs
that are part of the register state context to the render ring init fn;
otherwise the default context ends up with incorrect values, as they don't
get initialized until the init_clock_gating fn.

v2: Add workarounds to golden render state
This method has its own issues, first of all this is different for
each gen and it is generated using a tool so adding new workaround
and maintaining them across gens is not a straightforward process.

v3: Use LRIs to emit these workarounds (Ville)
Instead of modifying the golden render state the same LRIs are
emitted from within the driver.

v4: Use abstract name when exporting gen specific routines (Chris)

For: VIZ-4092
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Reviewed-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
drivers/gpu/drm/i915/i915_gem_context.c
drivers/gpu/drm/i915/intel_pm.c
drivers/gpu/drm/i915/intel_ringbuffer.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index 62ee178b1edb0dc2750dbb86b2de36dfca26bc71..a5221d8f1580182fe80c7c9cfc34da7d3484fe11 100644 (file)
@@ -628,6 +628,12 @@ done:
        ring->last_context = to;
 
        if (uninitialized) {
+               if (ring->init_context) {
+                       ret = ring->init_context(ring);
+                       if (ret)
+                               DRM_ERROR("ring init context: %d\n", ret);
+               }
+
                ret = i915_gem_render_state_init(ring);
                if (ret)
                        DRM_ERROR("init render state: %d\n", ret);
index b9edfd426a19716bd3e8ee463851b7c5e1a3a581..718023859686fac3055c01e3a846dca33715f62b 100644 (file)
@@ -5536,37 +5536,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
        /* FIXME(BDW): Check all the w/a, some might only apply to
         * pre-production hw. */
 
-       /* WaDisablePartialInstShootdown:bdw */
-       I915_WRITE(GEN8_ROW_CHICKEN,
-                  _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE));
-
-       /* WaDisableThreadStallDopClockGating:bdw */
-       /* FIXME: Unclear whether we really need this on production bdw. */
-       I915_WRITE(GEN8_ROW_CHICKEN,
-                  _MASKED_BIT_ENABLE(STALL_DOP_GATING_DISABLE));
 
-       /*
-        * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
-        * pre-production hardware
-        */
-       I915_WRITE(HALF_SLICE_CHICKEN3,
-                  _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS));
-       I915_WRITE(HALF_SLICE_CHICKEN3,
-                  _MASKED_BIT_ENABLE(GEN8_SAMPLER_POWER_BYPASS_DIS));
        I915_WRITE(GAMTARBMODE, _MASKED_BIT_ENABLE(ARB_MODE_BWGTLB_DISABLE));
 
        I915_WRITE(_3D_CHICKEN3,
                   _MASKED_BIT_ENABLE(_3D_CHICKEN_SDE_LIMIT_FIFO_POLY_DEPTH(2)));
 
-       I915_WRITE(COMMON_SLICE_CHICKEN2,
-                  _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
-
-       I915_WRITE(GEN7_HALF_SLICE_CHICKEN1,
-                  _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
-
-       /* WaDisableDopClockGating:bdw May not be needed for production */
-       I915_WRITE(GEN7_ROW_CHICKEN2,
-                  _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
 
        /* WaSwitchSolVfFArbitrationPriority:bdw */
        I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL);
@@ -5582,31 +5557,12 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
                           BDW_DPRS_MASK_VBLANK_SRD);
        }
 
-       /* Use Force Non-Coherent whenever executing a 3D context. This is a
-        * workaround for for a possible hang in the unlikely event a TLB
-        * invalidation occurs during a PSD flush.
-        */
-       I915_WRITE(HDC_CHICKEN0,
-                  I915_READ(HDC_CHICKEN0) |
-                  _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
-
        /* WaVSRefCountFullforceMissDisable:bdw */
        /* WaDSRefCountFullforceMissDisable:bdw */
        I915_WRITE(GEN7_FF_THREAD_MODE,
                   I915_READ(GEN7_FF_THREAD_MODE) &
                   ~(GEN8_FF_DS_REF_CNT_FFME | GEN7_FF_VS_REF_CNT_FFME));
 
-       /*
-        * BSpec recommends 8x4 when MSAA is used,
-        * however in practice 16x4 seems fastest.
-        *
-        * Note that PS/WM thread counts depend on the WIZ hashing
-        * disable bit, which we don't touch here, but it's good
-        * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
-        */
-       I915_WRITE(GEN7_GT_MODE,
-                  GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
-
        I915_WRITE(GEN6_RC_SLEEP_PSMI_CONTROL,
                   _MASKED_BIT_ENABLE(GEN8_RC_SEMA_IDLE_MSG_DISABLE));
 
@@ -5614,10 +5570,6 @@ static void broadwell_init_clock_gating(struct drm_device *dev)
        I915_WRITE(GEN8_UCGCTL6, I915_READ(GEN8_UCGCTL6) |
                   GEN8_SDEUNIT_CLOCK_GATE_DISABLE);
 
-       /* Wa4x4STCOptimizationDisable:bdw */
-       I915_WRITE(CACHE_MODE_1,
-                  _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
-
        lpt_init_clock_gating(dev);
 }
 
index de7654623acc1db660cb63e41011987362b426bc..1d5bfdb4fe9787bd1c3507696e37b649d90dc7dd 100644 (file)
@@ -657,6 +657,84 @@ err:
        return ret;
 }
 
+static inline void intel_ring_emit_wa(struct intel_engine_cs *ring,
+                                      u32 addr, u32 value)
+{
+       intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1)); /* dword 1: LRI command */
+       intel_ring_emit(ring, addr);                    /* dword 2: register address */
+       intel_ring_emit(ring, value);                   /* dword 3: value to load */
+}
+
+static int gen8_init_workarounds(struct intel_engine_cs *ring)
+{
+       int ret;
+
+       /*
+        * workarounds applied in this fn are part of register state context,
+        * they need to be re-initialized after gpu reset, suspend/resume,
+        * module reload.
+        */
+
+       /*
+        * Each workaround below emits 3 dwords; update the count passed to
+        * intel_ring_begin() whenever a workaround is added or removed.
+        */
+       ret = intel_ring_begin(ring, 24);
+       if (ret)
+               return ret;
+
+       /* WaDisablePartialInstShootdown:bdw */
+       /* WaDisableThreadStallDopClockGating:bdw */
+       /* FIXME: Unclear whether we really need this on production bdw. */
+       intel_ring_emit_wa(ring, GEN8_ROW_CHICKEN,
+                          _MASKED_BIT_ENABLE(PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE
+                                            | STALL_DOP_GATING_DISABLE));
+
+       /* WaDisableDopClockGating:bdw May not be needed for production */
+       intel_ring_emit_wa(ring, GEN7_ROW_CHICKEN2,
+                          _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE));
+
+       /*
+        * This GEN8_CENTROID_PIXEL_OPT_DIS W/A is only needed for
+        * pre-production hardware
+        */
+       intel_ring_emit_wa(ring, HALF_SLICE_CHICKEN3,
+                          _MASKED_BIT_ENABLE(GEN8_CENTROID_PIXEL_OPT_DIS
+                                             | GEN8_SAMPLER_POWER_BYPASS_DIS));
+
+       intel_ring_emit_wa(ring, GEN7_HALF_SLICE_CHICKEN1,
+                          _MASKED_BIT_ENABLE(GEN7_SINGLE_SUBSCAN_DISPATCH_ENABLE));
+
+       intel_ring_emit_wa(ring, COMMON_SLICE_CHICKEN2,
+                          _MASKED_BIT_ENABLE(GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE));
+
+       /* Use Force Non-Coherent whenever executing a 3D context. This is a
+        * workaround for a possible hang in the unlikely event a TLB
+        * invalidation occurs during a PSD flush.
+        */
+       intel_ring_emit_wa(ring, HDC_CHICKEN0,
+                          _MASKED_BIT_ENABLE(HDC_FORCE_NON_COHERENT));
+
+       /* Wa4x4STCOptimizationDisable:bdw */
+       intel_ring_emit_wa(ring, CACHE_MODE_1,
+                          _MASKED_BIT_ENABLE(GEN8_4x4_STC_OPTIMIZATION_DISABLE));
+
+       /*
+        * BSpec recommends 8x4 when MSAA is used,
+        * however in practice 16x4 seems fastest.
+        *
+        * Note that PS/WM thread counts depend on the WIZ hashing
+        * disable bit, which we don't touch here, but it's good
+        * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM).
+        */
+       intel_ring_emit_wa(ring, GEN7_GT_MODE,
+                          GEN6_WIZ_HASHING_MASK | GEN6_WIZ_HASHING_16x4);
+
+       intel_ring_advance(ring);
+
+       return 0;
+}
+
 static int init_render_ring(struct intel_engine_cs *ring)
 {
        struct drm_device *dev = ring->dev;
@@ -2143,6 +2221,7 @@ int intel_init_render_ring_buffer(struct drm_device *dev)
                                        dev_priv->semaphore_obj = obj;
                        }
                }
+               ring->init_context = gen8_init_workarounds;
                ring->add_request = gen6_add_request;
                ring->flush = gen8_render_ring_flush;
                ring->irq_get = gen8_ring_get_irq;
index 9cbf7b0ebc994fd6bd383fcb112d9b4413c1932f..96479c89f4bda9c2e6af0084e75b6785eee6ccfa 100644 (file)
@@ -148,6 +148,8 @@ struct  intel_engine_cs {
 
        int             (*init)(struct intel_engine_cs *ring);
 
+       int             (*init_context)(struct intel_engine_cs *ring);
+
        void            (*write_tail)(struct intel_engine_cs *ring,
                                      u32 value);
        int __must_check (*flush)(struct intel_engine_cs *ring,