drm/radeon: rework and fix reset detection v2
authorChristian König <christian.koenig@amd.com>
Tue, 29 Oct 2013 19:14:47 +0000 (20:14 +0100)
committerAlex Deucher <alexander.deucher@amd.com>
Fri, 1 Nov 2013 19:25:51 +0000 (15:25 -0400)
Stop fiddling with jiffies, always wait for RADEON_FENCE_JIFFIES_TIMEOUT.
Consolidate the two wait sequence implementations into just one function.
Activate all waiters and remember if the reset was already done instead of
trying to reset from only one thread.

v2: clear reset flag earlier to avoid timeout in IB test

Signed-off-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/radeon/radeon.h
drivers/gpu/drm/radeon/radeon_device.c
drivers/gpu/drm/radeon/radeon_fence.c

index ad54525cfa79680c6327902f4ca17353f49426aa..14531e155b3521d5e10412fea90de940c5f0bf00 100644 (file)
@@ -328,7 +328,6 @@ struct radeon_fence_driver {
        /* sync_seq is protected by ring emission lock */
        uint64_t                        sync_seq[RADEON_NUM_RINGS];
        atomic64_t                      last_seq;
-       unsigned long                   last_activity;
        bool                            initialized;
 };
 
@@ -2171,6 +2170,7 @@ struct radeon_device {
        bool                            need_dma32;
        bool                            accel_working;
        bool                            fastfb_working; /* IGP feature*/
+       bool                            needs_reset;
        struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES];
        const struct firmware *me_fw;   /* all family ME firmware */
        const struct firmware *pfp_fw;  /* r6/700 PFP firmware */
index b512c0098ee9b7df58f20f928b8125e205db70a9..61dbdd9381330a4d803d0a6ceccb5fe0ec67eda0 100644 (file)
@@ -1574,6 +1574,14 @@ int radeon_gpu_reset(struct radeon_device *rdev)
        int resched;
 
        down_write(&rdev->exclusive_lock);
+
+       if (!rdev->needs_reset) {
+               up_write(&rdev->exclusive_lock);
+               return 0;
+       }
+
+       rdev->needs_reset = false;
+
        radeon_save_bios_scratch_regs(rdev);
        /* block TTM */
        resched = ttm_bo_lock_delayed_workqueue(&rdev->mman.bdev);
index ddb8f8e04eb549f4fd264b2f704cdd407bcc65e0..b8f68b2c47d4ae7f6302a5c47aa6f9d3e282c2bc 100644 (file)
@@ -190,10 +190,8 @@ void radeon_fence_process(struct radeon_device *rdev, int ring)
                }
        } while (atomic64_xchg(&rdev->fence_drv[ring].last_seq, seq) > seq);
 
-       if (wake) {
-               rdev->fence_drv[ring].last_activity = jiffies;
+       if (wake)
                wake_up_all(&rdev->fence_queue);
-       }
 }
 
 /**
@@ -212,13 +210,13 @@ static void radeon_fence_destroy(struct kref *kref)
 }
 
 /**
- * radeon_fence_seq_signaled - check if a fence sequeuce number has signaled
+ * radeon_fence_seq_signaled - check if a fence sequence number has signaled
  *
  * @rdev: radeon device pointer
  * @seq: sequence number
  * @ring: ring index the fence is associated with
  *
- * Check if the last singled fence sequnce number is >= the requested
+ * Check if the last signaled fence sequnce number is >= the requested
  * sequence number (all asics).
  * Returns true if the fence has signaled (current fence value
  * is >= requested value) or false if it has not (current fence
@@ -263,113 +261,131 @@ bool radeon_fence_signaled(struct radeon_fence *fence)
 }
 
 /**
- * radeon_fence_wait_seq - wait for a specific sequence number
+ * radeon_fence_any_seq_signaled - check if any sequence number is signaled
  *
  * @rdev: radeon device pointer
- * @target_seq: sequence number we want to wait for
- * @ring: ring index the fence is associated with
+ * @seq: sequence numbers
+ *
+ * Check if the last signaled fence sequnce number is >= the requested
+ * sequence number (all asics).
+ * Returns true if any has signaled (current value is >= requested value)
+ * or false if it has not. Helper function for radeon_fence_wait_seq.
+ */
+static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
+{
+       unsigned i;
+
+       for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+               if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i))
+                       return true;
+       }
+       return false;
+}
+
+/**
+ * radeon_fence_wait_seq - wait for a specific sequence numbers
+ *
+ * @rdev: radeon device pointer
+ * @target_seq: sequence number(s) we want to wait for
  * @intr: use interruptable sleep
  * @lock_ring: whether the ring should be locked or not
  *
- * Wait for the requested sequence number to be written (all asics).
+ * Wait for the requested sequence number(s) to be written by any ring
+ * (all asics).  Sequnce number array is indexed by ring id.
  * @intr selects whether to use interruptable (true) or non-interruptable
  * (false) sleep when waiting for the sequence number.  Helper function
- * for radeon_fence_wait(), et al.
+ * for radeon_fence_wait_*().
  * Returns 0 if the sequence number has passed, error for all other cases.
- * -EDEADLK is returned when a GPU lockup has been detected and the ring is
- * marked as not ready so no further jobs get scheduled until a successful
- * reset.
+ * -EDEADLK is returned when a GPU lockup has been detected.
  */
-static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 target_seq,
-                                unsigned ring, bool intr, bool lock_ring)
+static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 *target_seq,
+                                bool intr, bool lock_ring)
 {
-       unsigned long timeout, last_activity;
-       uint64_t seq;
-       unsigned i;
+       uint64_t last_seq[RADEON_NUM_RINGS];
        bool signaled;
-       int r;
+       int i, r;
+
+       while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
+
+               /* Save current sequence values, used to check for GPU lockups */
+               for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+                       if (!target_seq[i])
+                               continue;
 
-       while (target_seq > atomic64_read(&rdev->fence_drv[ring].last_seq)) {
-               if (!rdev->ring[ring].ready) {
-                       return -EBUSY;
+                       last_seq[i] = atomic64_read(&rdev->fence_drv[i].last_seq);
+                       trace_radeon_fence_wait_begin(rdev->ddev, target_seq[i]);
+                       radeon_irq_kms_sw_irq_get(rdev, i);
                }
 
-               timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
-               if (time_after(rdev->fence_drv[ring].last_activity, timeout)) {
-                       /* the normal case, timeout is somewhere before last_activity */
-                       timeout = rdev->fence_drv[ring].last_activity - timeout;
+               if (intr) {
+                       r = wait_event_interruptible_timeout(rdev->fence_queue, (
+                               (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
+                                || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
                } else {
-                       /* either jiffies wrapped around, or no fence was signaled in the last 500ms
-                        * anyway we will just wait for the minimum amount and then check for a lockup
-                        */
-                       timeout = 1;
+                       r = wait_event_timeout(rdev->fence_queue, (
+                               (signaled = radeon_fence_any_seq_signaled(rdev, target_seq))
+                                || rdev->needs_reset), RADEON_FENCE_JIFFIES_TIMEOUT);
                }
-               seq = atomic64_read(&rdev->fence_drv[ring].last_seq);
-               /* Save current last activity valuee, used to check for GPU lockups */
-               last_activity = rdev->fence_drv[ring].last_activity;
 
-               trace_radeon_fence_wait_begin(rdev->ddev, seq);
-               radeon_irq_kms_sw_irq_get(rdev, ring);
-               if (intr) {
-                       r = wait_event_interruptible_timeout(rdev->fence_queue,
-                               (signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
-                               timeout);
-                } else {
-                       r = wait_event_timeout(rdev->fence_queue,
-                               (signaled = radeon_fence_seq_signaled(rdev, target_seq, ring)),
-                               timeout);
+               for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+                       if (!target_seq[i])
+                               continue;
+
+                       radeon_irq_kms_sw_irq_put(rdev, i);
+                       trace_radeon_fence_wait_end(rdev->ddev, target_seq[i]);
                }
-               radeon_irq_kms_sw_irq_put(rdev, ring);
-               if (unlikely(r < 0)) {
+
+               if (unlikely(r < 0))
                        return r;
-               }
-               trace_radeon_fence_wait_end(rdev->ddev, seq);
 
                if (unlikely(!signaled)) {
+                       if (rdev->needs_reset)
+                               return -EDEADLK;
+
                        /* we were interrupted for some reason and fence
                         * isn't signaled yet, resume waiting */
-                       if (r) {
+                       if (r)
                                continue;
+
+                       for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+                               if (!target_seq[i])
+                                       continue;
+
+                               if (last_seq[i] != atomic64_read(&rdev->fence_drv[i].last_seq))
+                                       break;
                        }
 
-                       /* check if sequence value has changed since last_activity */
-                       if (seq != atomic64_read(&rdev->fence_drv[ring].last_seq)) {
+                       if (i != RADEON_NUM_RINGS)
                                continue;
-                       }
 
-                       if (lock_ring) {
+                       if (lock_ring)
                                mutex_lock(&rdev->ring_lock);
-                       }
 
-                       /* test if somebody else has already decided that this is a lockup */
-                       if (last_activity != rdev->fence_drv[ring].last_activity) {
-                               if (lock_ring) {
-                                       mutex_unlock(&rdev->ring_lock);
-                               }
-                               continue;
+                       for (i = 0; i < RADEON_NUM_RINGS; ++i) {
+                               if (!target_seq[i])
+                                       continue;
+
+                               if (radeon_ring_is_lockup(rdev, i, &rdev->ring[i]))
+                                       break;
                        }
 
-                       if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
+                       if (i < RADEON_NUM_RINGS) {
                                /* good news we believe it's a lockup */
-                               dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx last fence id 0x%016llx)\n",
-                                        target_seq, seq);
-
-                               /* change last activity so nobody else think there is a lockup */
-                               for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-                                       rdev->fence_drv[i].last_activity = jiffies;
-                               }
-
-                               /* mark the ring as not ready any more */
-                               rdev->ring[ring].ready = false;
-                               if (lock_ring) {
+                               dev_warn(rdev->dev, "GPU lockup (waiting for "
+                                        "0x%016llx last fence id 0x%016llx on"
+                                        " ring %d)\n",
+                                        target_seq[i], last_seq[i], i);
+
+                               /* remember that we need an reset */
+                               rdev->needs_reset = true;
+                               if (lock_ring)
                                        mutex_unlock(&rdev->ring_lock);
-                               }
+                               wake_up_all(&rdev->fence_queue);
                                return -EDEADLK;
                        }
 
-                       if (lock_ring) {
+                       if (lock_ring)
                                mutex_unlock(&rdev->ring_lock);
-                       }
                }
        }
        return 0;
@@ -388,6 +404,7 @@ static int radeon_fence_wait_seq(struct radeon_device *rdev, u64 target_seq,
  */
 int radeon_fence_wait(struct radeon_fence *fence, bool intr)
 {
+       uint64_t seq[RADEON_NUM_RINGS] = {};
        int r;
 
        if (fence == NULL) {
@@ -395,147 +412,15 @@ int radeon_fence_wait(struct radeon_fence *fence, bool intr)
                return -EINVAL;
        }
 
-       r = radeon_fence_wait_seq(fence->rdev, fence->seq,
-                                 fence->ring, intr, true);
-       if (r) {
-               return r;
-       }
-       fence->seq = RADEON_FENCE_SIGNALED_SEQ;
-       return 0;
-}
-
-static bool radeon_fence_any_seq_signaled(struct radeon_device *rdev, u64 *seq)
-{
-       unsigned i;
-
-       for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-               if (seq[i] && radeon_fence_seq_signaled(rdev, seq[i], i)) {
-                       return true;
-               }
-       }
-       return false;
-}
-
-/**
- * radeon_fence_wait_any_seq - wait for a sequence number on any ring
- *
- * @rdev: radeon device pointer
- * @target_seq: sequence number(s) we want to wait for
- * @intr: use interruptable sleep
- *
- * Wait for the requested sequence number(s) to be written by any ring
- * (all asics).  Sequnce number array is indexed by ring id.
- * @intr selects whether to use interruptable (true) or non-interruptable
- * (false) sleep when waiting for the sequence number.  Helper function
- * for radeon_fence_wait_any(), et al.
- * Returns 0 if the sequence number has passed, error for all other cases.
- */
-static int radeon_fence_wait_any_seq(struct radeon_device *rdev,
-                                    u64 *target_seq, bool intr)
-{
-       unsigned long timeout, last_activity, tmp;
-       unsigned i, ring = RADEON_NUM_RINGS;
-       bool signaled;
-       int r;
-
-       for (i = 0, last_activity = 0; i < RADEON_NUM_RINGS; ++i) {
-               if (!target_seq[i]) {
-                       continue;
-               }
-
-               /* use the most recent one as indicator */
-               if (time_after(rdev->fence_drv[i].last_activity, last_activity)) {
-                       last_activity = rdev->fence_drv[i].last_activity;
-               }
-
-               /* For lockup detection just pick the lowest ring we are
-                * actively waiting for
-                */
-               if (i < ring) {
-                       ring = i;
-               }
-       }
-
-       /* nothing to wait for ? */
-       if (ring == RADEON_NUM_RINGS) {
-               return -ENOENT;
-       }
-
-       while (!radeon_fence_any_seq_signaled(rdev, target_seq)) {
-               timeout = jiffies - RADEON_FENCE_JIFFIES_TIMEOUT;
-               if (time_after(last_activity, timeout)) {
-                       /* the normal case, timeout is somewhere before last_activity */
-                       timeout = last_activity - timeout;
-               } else {
-                       /* either jiffies wrapped around, or no fence was signaled in the last 500ms
-                        * anyway we will just wait for the minimum amount and then check for a lockup
-                        */
-                       timeout = 1;
-               }
+       seq[fence->ring] = fence->seq;
+       if (seq[fence->ring] == RADEON_FENCE_SIGNALED_SEQ)
+               return 0;
 
-               trace_radeon_fence_wait_begin(rdev->ddev, target_seq[ring]);
-               for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-                       if (target_seq[i]) {
-                               radeon_irq_kms_sw_irq_get(rdev, i);
-                       }
-               }
-               if (intr) {
-                       r = wait_event_interruptible_timeout(rdev->fence_queue,
-                               (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
-                               timeout);
-               } else {
-                       r = wait_event_timeout(rdev->fence_queue,
-                               (signaled = radeon_fence_any_seq_signaled(rdev, target_seq)),
-                               timeout);
-               }
-               for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-                       if (target_seq[i]) {
-                               radeon_irq_kms_sw_irq_put(rdev, i);
-                       }
-               }
-               if (unlikely(r < 0)) {
-                       return r;
-               }
-               trace_radeon_fence_wait_end(rdev->ddev, target_seq[ring]);
-
-               if (unlikely(!signaled)) {
-                       /* we were interrupted for some reason and fence
-                        * isn't signaled yet, resume waiting */
-                       if (r) {
-                               continue;
-                       }
-
-                       mutex_lock(&rdev->ring_lock);
-                       for (i = 0, tmp = 0; i < RADEON_NUM_RINGS; ++i) {
-                               if (time_after(rdev->fence_drv[i].last_activity, tmp)) {
-                                       tmp = rdev->fence_drv[i].last_activity;
-                               }
-                       }
-                       /* test if somebody else has already decided that this is a lockup */
-                       if (last_activity != tmp) {
-                               last_activity = tmp;
-                               mutex_unlock(&rdev->ring_lock);
-                               continue;
-                       }
-
-                       if (radeon_ring_is_lockup(rdev, ring, &rdev->ring[ring])) {
-                               /* good news we believe it's a lockup */
-                               dev_warn(rdev->dev, "GPU lockup (waiting for 0x%016llx)\n",
-                                        target_seq[ring]);
-
-                               /* change last activity so nobody else think there is a lockup */
-                               for (i = 0; i < RADEON_NUM_RINGS; ++i) {
-                                       rdev->fence_drv[i].last_activity = jiffies;
-                               }
+       r = radeon_fence_wait_seq(fence->rdev, seq, intr, true);
+       if (r)
+               return r;
 
-                               /* mark the ring as not ready any more */
-                               rdev->ring[ring].ready = false;
-                               mutex_unlock(&rdev->ring_lock);
-                               return -EDEADLK;
-                       }
-                       mutex_unlock(&rdev->ring_lock);
-               }
-       }
+       fence->seq = RADEON_FENCE_SIGNALED_SEQ;
        return 0;
 }
 
@@ -557,7 +442,7 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
                          bool intr)
 {
        uint64_t seq[RADEON_NUM_RINGS];
-       unsigned i;
+       unsigned i, num_rings = 0;
        int r;
 
        for (i = 0; i < RADEON_NUM_RINGS; ++i) {
@@ -567,15 +452,19 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
                        continue;
                }
 
-               if (fences[i]->seq == RADEON_FENCE_SIGNALED_SEQ) {
-                       /* something was allready signaled */
-                       return 0;
-               }
-
                seq[i] = fences[i]->seq;
+               ++num_rings;
+
+               /* test if something was allready signaled */
+               if (seq[i] == RADEON_FENCE_SIGNALED_SEQ)
+                       return 0;
        }
 
-       r = radeon_fence_wait_any_seq(rdev, seq, intr);
+       /* nothing to wait for ? */
+       if (num_rings == 0)
+               return -ENOENT;
+
+       r = radeon_fence_wait_seq(rdev, seq, intr, true);
        if (r) {
                return r;
        }
@@ -594,15 +483,15 @@ int radeon_fence_wait_any(struct radeon_device *rdev,
  */
 int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
 {
-       uint64_t seq;
+       uint64_t seq[RADEON_NUM_RINGS] = {};
 
-       seq = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
-       if (seq >= rdev->fence_drv[ring].sync_seq[ring]) {
+       seq[ring] = atomic64_read(&rdev->fence_drv[ring].last_seq) + 1ULL;
+       if (seq[ring] >= rdev->fence_drv[ring].sync_seq[ring]) {
                /* nothing to wait for, last_seq is
                   already the last emited fence */
                return -ENOENT;
        }
-       return radeon_fence_wait_seq(rdev, seq, ring, false, false);
+       return radeon_fence_wait_seq(rdev, seq, false, false);
 }
 
 /**
@@ -617,14 +506,15 @@ int radeon_fence_wait_next_locked(struct radeon_device *rdev, int ring)
  */
 int radeon_fence_wait_empty_locked(struct radeon_device *rdev, int ring)
 {
-       uint64_t seq = rdev->fence_drv[ring].sync_seq[ring];
+       uint64_t seq[RADEON_NUM_RINGS] = {};
        int r;
 
-       r = radeon_fence_wait_seq(rdev, seq, ring, false, false);
+       seq[ring] = rdev->fence_drv[ring].sync_seq[ring];
+       r = radeon_fence_wait_seq(rdev, seq, false, false);
        if (r) {
-               if (r == -EDEADLK) {
+               if (r == -EDEADLK)
                        return -EDEADLK;
-               }
+
                dev_err(rdev->dev, "error waiting for ring[%d] to become idle (%d)\n",
                        ring, r);
        }
@@ -826,7 +716,6 @@ static void radeon_fence_driver_init_ring(struct radeon_device *rdev, int ring)
        for (i = 0; i < RADEON_NUM_RINGS; ++i)
                rdev->fence_drv[ring].sync_seq[i] = 0;
        atomic64_set(&rdev->fence_drv[ring].last_seq, 0);
-       rdev->fence_drv[ring].last_activity = jiffies;
        rdev->fence_drv[ring].initialized = false;
 }