drm/i915: Fix __wait_seqno to use true infinite timeouts
authorChris Wilson <chris@chris-wilson.co.uk>
Wed, 25 Sep 2013 16:34:55 +0000 (17:34 +0100)
committerDaniel Vetter <daniel.vetter@ffwll.ch>
Thu, 3 Oct 2013 18:01:30 +0000 (20:01 +0200)
When we switched to always using a timeout in conjunction with
wait_seqno, we lost the ability to detect missed interrupts. Since, we
have had issues with interrupts on a number of generations, and they are
required to be delivered in a timely fashion for a smooth UX, it is
important that we do log errors found in the wild and prevent the
display stalling for upwards of 1s every time the seqno interrupt is
missed.

Rather than continue to fix up the timeouts to work around the interface
impedence in wait_event_*(), open code the combination of
wait_event[_interruptible][_timeout], and use the exposed timer to
poll for seqno should we detect a lost interrupt.

v2: In order to satisfy the debug requirement of logging missed
interrupts with the real world requirments of making machines work even
if interrupts are hosed, we revert to polling after detecting a missed
interrupt.

v3: Throw in a debugfs interface to simulate broken hw not reporting
interrupts.

v4: s/EGAIN/EAGAIN/ (Imre)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Imre Deak <imre.deak@intel.com>
[danvet: Don't use the struct typedef in new code.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_gem.c
drivers/gpu/drm/i915/i915_gpu_error.c
drivers/gpu/drm/i915/i915_irq.c

index fcfa98844ccc450d1b01d9bb67596d8cb4ba3b9c..bc5c04d5890f389063a3d1327b20da02601f4c6a 100644 (file)
@@ -1897,6 +1897,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops,
                        i915_ring_stop_get, i915_ring_stop_set,
                        "0x%08llx\n");
 
+static int
+i915_ring_missed_irq_get(void *data, u64 *val)
+{
+       struct drm_device *dev = data;
+       struct drm_i915_private *dev_priv = dev->dev_private;
+
+       *val = dev_priv->gpu_error.missed_irq_rings;
+       return 0;
+}
+
+static int
+i915_ring_missed_irq_set(void *data, u64 val)
+{
+       struct drm_device *dev = data;
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       int ret;
+
+       /* Lock against concurrent debugfs callers */
+       ret = mutex_lock_interruptible(&dev->struct_mutex);
+       if (ret)
+               return ret;
+       dev_priv->gpu_error.missed_irq_rings = val;
+       mutex_unlock(&dev->struct_mutex);
+
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
+                       i915_ring_missed_irq_get, i915_ring_missed_irq_set,
+                       "0x%08llx\n");
+
+static int
+i915_ring_test_irq_get(void *data, u64 *val)
+{
+       struct drm_device *dev = data;
+       struct drm_i915_private *dev_priv = dev->dev_private;
+
+       *val = dev_priv->gpu_error.test_irq_rings;
+
+       return 0;
+}
+
+static int
+i915_ring_test_irq_set(void *data, u64 val)
+{
+       struct drm_device *dev = data;
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       int ret;
+
+       DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
+
+       /* Lock against concurrent debugfs callers */
+       ret = mutex_lock_interruptible(&dev->struct_mutex);
+       if (ret)
+               return ret;
+
+       dev_priv->gpu_error.test_irq_rings = val;
+       mutex_unlock(&dev->struct_mutex);
+
+       return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
+                       i915_ring_test_irq_get, i915_ring_test_irq_set,
+                       "0x%08llx\n");
+
 #define DROP_UNBOUND 0x1
 #define DROP_BOUND 0x2
 #define DROP_RETIRE 0x4
@@ -2290,6 +2356,8 @@ static struct i915_debugfs_files {
        {"i915_min_freq", &i915_min_freq_fops},
        {"i915_cache_sharing", &i915_cache_sharing_fops},
        {"i915_ring_stop", &i915_ring_stop_fops},
+       {"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
+       {"i915_ring_test_irq", &i915_ring_test_irq_fops},
        {"i915_gem_drop_caches", &i915_drop_caches_fops},
        {"i915_error_state", &i915_error_state_fops},
        {"i915_next_seqno", &i915_next_seqno_fops},
index 08e96a8c01aa86d94a11116305c7efe646a19ea2..79bbcf925e4ad31c67c107320e81bad2d6f1e91a 100644 (file)
@@ -1013,6 +1013,9 @@ struct i915_gpu_error {
        struct drm_i915_error_state *first_error;
        struct work_struct work;
 
+
+       unsigned long missed_irq_rings;
+
        /**
         * State variable and reset counter controlling the reset flow
         *
@@ -1051,6 +1054,9 @@ struct i915_gpu_error {
 
        /* For gpu hang simulation. */
        unsigned int stop_rings;
+
+       /* For missed irq/seqno simulation. */
+       unsigned int test_irq_rings;
 };
 
 enum modeset_restore {
index 3ae925b0045f42d6099d4774ab2ae054a53f9c2e..53e315131700c79cc924dcc206171d597f046e40 100644 (file)
@@ -971,6 +971,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
        return ret;
 }
 
+static void fake_irq(unsigned long data)
+{
+       wake_up_process((struct task_struct *)data);
+}
+
+static bool missed_irq(struct drm_i915_private *dev_priv,
+                      struct intel_ring_buffer *ring)
+{
+       return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
+}
+
 /**
  * __wait_seqno - wait until execution of seqno has finished
  * @ring: the ring expected to report seqno
@@ -994,10 +1005,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
                        bool interruptible, struct timespec *timeout)
 {
        drm_i915_private_t *dev_priv = ring->dev->dev_private;
-       struct timespec before, now, wait_time={1,0};
-       unsigned long timeout_jiffies;
-       long end;
-       bool wait_forever = true;
+       struct timespec before, now;
+       DEFINE_WAIT(wait);
+       long timeout_jiffies;
        int ret;
 
        WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n");
@@ -1005,51 +1015,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
        if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
                return 0;
 
-       trace_i915_gem_request_wait_begin(ring, seqno);
-
-       if (timeout != NULL) {
-               wait_time = *timeout;
-               wait_forever = false;
-       }
-
-       timeout_jiffies = timespec_to_jiffies_timeout(&wait_time);
+       timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
 
-       if (WARN_ON(!ring->irq_get(ring)))
+       if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
+           WARN_ON(!ring->irq_get(ring)))
                return -ENODEV;
 
-       /* Record current time in case interrupted by signal, or wedged * */
+       /* Record current time in case interrupted by signal, or wedged */
+       trace_i915_gem_request_wait_begin(ring, seqno);
        getrawmonotonic(&before);
+       for (;;) {
+               struct timer_list timer;
+               unsigned long expire;
 
-#define EXIT_COND \
-       (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
-        i915_reset_in_progress(&dev_priv->gpu_error) || \
-        reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
-       do {
-               if (interruptible)
-                       end = wait_event_interruptible_timeout(ring->irq_queue,
-                                                              EXIT_COND,
-                                                              timeout_jiffies);
-               else
-                       end = wait_event_timeout(ring->irq_queue, EXIT_COND,
-                                                timeout_jiffies);
+               prepare_to_wait(&ring->irq_queue, &wait,
+                               interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
 
                /* We need to check whether any gpu reset happened in between
                 * the caller grabbing the seqno and now ... */
-               if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
-                       end = -EAGAIN;
+               if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
+                       /* ... but upgrade the -EAGAIN to an -EIO if the gpu
+                        * is truely gone. */
+                       ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+                       if (ret == 0)
+                               ret = -EAGAIN;
+                       break;
+               }
 
-               /* ... but upgrade the -EGAIN to an -EIO if the gpu is truely
-                * gone. */
-               ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
-               if (ret)
-                       end = ret;
-       } while (end == 0 && wait_forever);
+               if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
+                       ret = 0;
+                       break;
+               }
+
+               if (interruptible && signal_pending(current)) {
+                       ret = -ERESTARTSYS;
+                       break;
+               }
+
+               if (timeout_jiffies <= 0) {
+                       ret = -ETIME;
+                       break;
+               }
 
+               timer.function = NULL;
+               if (timeout || missed_irq(dev_priv, ring)) {
+                       setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
+                       expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies);
+                       mod_timer(&timer, expire);
+               }
+
+               schedule();
+
+               if (timeout)
+                       timeout_jiffies = expire - jiffies;
+
+               if (timer.function) {
+                       del_singleshot_timer_sync(&timer);
+                       destroy_timer_on_stack(&timer);
+               }
+       }
        getrawmonotonic(&now);
+       trace_i915_gem_request_wait_end(ring, seqno);
 
        ring->irq_put(ring);
-       trace_i915_gem_request_wait_end(ring, seqno);
-#undef EXIT_COND
+
+       finish_wait(&ring->irq_queue, &wait);
 
        if (timeout) {
                struct timespec sleep_time = timespec_sub(now, before);
@@ -1058,17 +1088,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
                        set_normalized_timespec(timeout, 0, 0);
        }
 
-       switch (end) {
-       case -EIO:
-       case -EAGAIN: /* Wedged */
-       case -ERESTARTSYS: /* Signal */
-               return (int)end;
-       case 0: /* Timeout */
-               return -ETIME;
-       default: /* Completed */
-               WARN_ON(end < 0); /* We're not aware of other errors */
-               return 0;
-       }
+       return ret;
 }
 
 /**
index 0a49b651e51014de4db0f00af07f6dc15136c2a5..da1022a328e37e9cacb822569b135b7a22b702c6 100644 (file)
@@ -311,6 +311,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
        err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
        err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
        err_printf(m, "CCID: 0x%08x\n", error->ccid);
+       err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
 
        for (i = 0; i < dev_priv->num_fence_regs; i++)
                err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
index 84b7efc6ee9126236a90d57956ba7b20e81a780e..05c05a6a436006c08c55d95fc6eb367aec2991ad 100644 (file)
@@ -2039,10 +2039,13 @@ static void i915_hangcheck_elapsed(unsigned long data)
 
                                if (waitqueue_active(&ring->irq_queue)) {
                                        /* Issue a wake-up to catch stuck h/w. */
-                                       DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-                                                 ring->name);
-                                       wake_up_all(&ring->irq_queue);
-                                       ring->hangcheck.score += HUNG;
+                                       if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
+                                               DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+                                                         ring->name);
+                                               wake_up_all(&ring->irq_queue);
+                                       }
+                                       /* Safeguard against driver failure */
+                                       ring->hangcheck.score += BUSY;
                                } else
                                        busy = false;
                        } else {