enum intel_engine_id id;
if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
- seq_printf(m, "Wedged\n");
- if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags))
- seq_printf(m, "Reset in progress\n");
+ seq_puts(m, "Wedged\n");
+ if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
+ seq_puts(m, "Reset in progress: struct_mutex backoff\n");
+ if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
+ seq_puts(m, "Reset in progress: reset handoff to waiter\n");
if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
- seq_printf(m, "Waiter holding struct mutex\n");
+ seq_puts(m, "Waiter holding struct mutex\n");
if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
- seq_printf(m, "struct_mutex blocked for reset\n");
+ seq_puts(m, "struct_mutex blocked for reset\n");
if (!i915.enable_hangcheck) {
- seq_printf(m, "Hangcheck disabled\n");
+ seq_puts(m, "Hangcheck disabled\n");
return 0;
}
* while it is writing to 'i915_wedged'
*/
- if (i915_reset_in_progress(&dev_priv->gpu_error))
+ if (i915_reset_backoff(&dev_priv->gpu_error))
return -EAGAIN;
i915_handle_error(dev_priv, val,
int ret;
lockdep_assert_held(&dev_priv->drm.struct_mutex);
+ GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
- if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
+ if (!test_bit(I915_RESET_HANDOFF, &error->flags))
return;
/* Clear any previous failed attempts at recovery. Time to try again. */
wakeup:
i915_gem_reset_finish(dev_priv);
enable_irq(dev_priv->drm.irq);
- wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS);
+
+ clear_bit(I915_RESET_HANDOFF, &error->flags);
+ wake_up_bit(&error->flags, I915_RESET_HANDOFF);
return;
error:
*/
unsigned long reset_count;
+ /**
+ * flags: Control various stages of the GPU reset
+ *
+ * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
+ * other users acquiring the struct_mutex. To do this we set the
+ * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
+ * and then check for that bit before acquiring the struct_mutex (in
+ * i915_mutex_lock_interruptible()?). #I915_RESET_BACKOFF serves a
+ * secondary role in preventing two concurrent global reset attempts.
+ *
+ * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
+ * struct_mutex. We try to acquire the struct_mutex in the reset worker,
+ * but it may be held by some long-running waiter (that we cannot
+ * interrupt without causing trouble). Once we are ready to do the GPU
+ * reset, we set the #I915_RESET_HANDOFF bit and wake up any waiters. If
+ * they already hold the struct_mutex and want to participate, they can
+ * inspect the bit and do the reset directly; otherwise the worker
+ * waits for the struct_mutex.
+ *
+ * #I915_WEDGED - If reset fails and we can no longer use the GPU,
+ * we set the #I915_WEDGED bit. Prior to command submission (e.g. in
+ * i915_gem_request_alloc()), this bit is checked and the sequence
+ * is aborted (with -EIO reported to userspace) if set.
+ */
unsigned long flags;
-#define I915_RESET_IN_PROGRESS 0
+#define I915_RESET_BACKOFF 0
+#define I915_RESET_HANDOFF 1
#define I915_WEDGED (BITS_PER_LONG - 1)
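To make the #I915_RESET_BACKOFF rule documented above concrete, here is a minimal, illustrative sketch of a lock path that backs off while a reset is pending. It uses only the helpers and fields visible in this patch (i915_reset_backoff(), gpu_error.reset_queue, I915_RESET_TIMEOUT); the function name is hypothetical and this is roughly what the check before taking struct_mutex amounts to, not the patch's actual implementation.

/* Illustrative only: back off instead of taking struct_mutex while a
 * reset is pending, then retry once the reset worker has cleared
 * I915_RESET_BACKOFF and woken gpu_error.reset_queue.
 */
static int example_lock_struct_mutex(struct drm_i915_private *dev_priv)
{
	struct i915_gpu_error *error = &dev_priv->gpu_error;

	while (i915_reset_backoff(error)) {
		long ret;

		ret = wait_event_interruptible_timeout(error->reset_queue,
						       !i915_reset_backoff(error),
						       I915_RESET_TIMEOUT);
		if (ret < 0)
			return ret;	/* interrupted by a signal */
		if (ret == 0)
			return -EIO;	/* reset never completed */
	}

	return mutex_lock_interruptible(&dev_priv->drm.struct_mutex);
}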
/**
void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
-static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
+static inline bool i915_reset_backoff(struct i915_gpu_error *error)
+{
+ return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
+}
+
+static inline bool i915_reset_handoff(struct i915_gpu_error *error)
{
- return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
+ return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
}
static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
return unlikely(test_bit(I915_WEDGED, &error->flags));
}
-static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
+static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
{
- return i915_reset_in_progress(error) | i915_terminally_wedged(error);
+ return i915_reset_backoff(error) | i915_terminally_wedged(error);
}
static inline u32 i915_reset_count(struct i915_gpu_error *error)
might_sleep();
- if (!i915_reset_in_progress(error))
- return 0;
-
/*
* Only wait 10 seconds for the gpu reset to complete to avoid hanging
* userspace. If it takes that long something really bad is going on and
* we should simply try to bail out and fail as gracefully as possible.
*/
ret = wait_event_interruptible_timeout(error->reset_queue,
- !i915_reset_in_progress(error),
+ !i915_reset_backoff(error),
I915_RESET_TIMEOUT);
if (ret == 0) {
DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request)
{
- if (likely(!i915_reset_in_progress(&request->i915->gpu_error)))
+ if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
return false;
__set_current_state(TASK_RUNNING);
return ret;
}
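As the flags comment describes, a waiter that already holds struct_mutex can pick up the handoff and perform the reset itself rather than blocking the worker. A minimal sketch of that idea, assuming only the i915_reset_handoff() helper added above and the existing i915_reset() entry point (the function name below is hypothetical):

/* Illustrative only: a struct_mutex holder notices the handoff and
 * resets on behalf of the worker; i915_reset() clears
 * I915_RESET_HANDOFF and wakes the worker when it is done.
 */
static void example_pickup_reset_handoff(struct drm_i915_private *i915)
{
	lockdep_assert_held(&i915->drm.struct_mutex);

	if (i915_reset_handoff(&i915->gpu_error))
		i915_reset(i915);
}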
-static void i915_error_wake_up(struct drm_i915_private *dev_priv)
-{
- /*
- * Notify all waiters for GPU completion events that reset state has
- * been changed, and that they need to restart their wait after
- * checking for potential errors (and bail out to drop locks if there is
- * a gpu reset pending so that i915_error_work_func can acquire them).
- */
-
- /* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
- wake_up_all(&dev_priv->gpu_error.wait_queue);
-
- /* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
- wake_up_all(&dev_priv->pending_flip_queue);
-}
-
/**
* i915_reset_and_wakeup - do process context error handling work
* @dev_priv: i915 device private
intel_prepare_reset(dev_priv);
+ set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
+ wake_up_all(&dev_priv->gpu_error.wait_queue);
+
do {
/*
* All state reset _must_ be completed before we update the
/* We need to wait for anyone holding the lock to wakeup */
} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
- I915_RESET_IN_PROGRESS,
+ I915_RESET_HANDOFF,
TASK_UNINTERRUPTIBLE,
HZ));
* Note: The wake_up also serves as a memory barrier so that
* waiters see the updated value of the dev_priv->gpu_error.
*/
+ clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
wake_up_all(&dev_priv->gpu_error.reset_queue);
}
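Pulling the i915_reset_and_wakeup() hunks together, the worker-side ordering of the two flags reduces to roughly the sketch below. This is a condensed illustration, not the patch's exact body: the code elided between hunks (uevent notification, display cleanup) is omitted, and the mutex_trylock() used to attempt the reset directly is an assumption.

/* Illustrative only: condensed worker-side ordering for the two flags. */
static void example_reset_worker(struct drm_i915_private *dev_priv)
{
	struct i915_gpu_error *error = &dev_priv->gpu_error;

	/* I915_RESET_BACKOFF was set in i915_handle_error() before this
	 * worker was scheduled, so new struct_mutex users already back off.
	 */
	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));

	intel_prepare_reset(dev_priv);

	/* Offer the reset to anyone already holding struct_mutex. */
	set_bit(I915_RESET_HANDOFF, &error->flags);
	wake_up_all(&error->wait_queue);

	do {
		/* If nobody takes the handoff, try the reset ourselves. */
		if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
			i915_reset(dev_priv);
			mutex_unlock(&dev_priv->drm.struct_mutex);
		}
		/* Wait for whoever holds struct_mutex to clear HANDOFF. */
	} while (wait_on_bit_timeout(&error->flags, I915_RESET_HANDOFF,
				     TASK_UNINTERRUPTIBLE, HZ));

	/* Reset complete: release the backed-off struct_mutex users. */
	clear_bit(I915_RESET_BACKOFF, &error->flags);
	wake_up_all(&error->reset_queue);
}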
if (!engine_mask)
goto out;
- if (test_and_set_bit(I915_RESET_IN_PROGRESS,
+ if (test_and_set_bit(I915_RESET_BACKOFF,
&dev_priv->gpu_error.flags))
goto out;
- /*
- * Wakeup waiting processes so that the reset function
- * i915_reset_and_wakeup doesn't deadlock trying to grab
- * various locks. By bumping the reset counter first, the woken
- * processes will see a reset in progress and back off,
- * releasing their locks and then wait for the reset completion.
- * We must do this for _all_ gpu waiters that might hold locks
- * that the reset work needs to acquire.
- *
- * Note: The wake_up also provides a memory barrier to ensure that the
- * waiters see the updated value of the reset flags.
- */
- i915_error_wake_up(dev_priv);
-
i915_reset_and_wakeup(dev_priv);
out:
{
struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
- if (i915_reset_in_progress(error))
+ if (i915_reset_backoff(error))
return true;
if (crtc->reset_count != i915_reset_count(error))
goto cleanup;
intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
- if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
+ if (i915_reset_backoff_or_wedged(&dev_priv->gpu_error)) {
ret = -EIO;
goto unlock;
}
/* Check that we can issue a global GPU reset */
- set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
+ set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+ set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
mutex_lock(&i915->drm.struct_mutex);
reset_count = i915_reset_count(&i915->gpu_error);
}
mutex_unlock(&i915->drm.struct_mutex);
- GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
+ GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
+ clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
if (i915_terminally_wedged(&i915->gpu_error))
err = -EIO;
reset_count = i915_reset_count(&rq->i915->gpu_error);
- set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags);
+ set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
wake_up_all(&rq->i915->gpu_error.wait_queue);
return reset_count;
/* Check that we detect a stuck waiter and issue a reset */
- set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
+ set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
mutex_lock(&i915->drm.struct_mutex);
err = hang_init(&h, i915);
err = timeout;
goto out_rq;
}
- GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
+ GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
if (i915_reset_count(&i915->gpu_error) == reset_count) {
pr_err("No GPU reset recorded!\n");
err = -EINVAL;
hang_fini(&h);
unlock:
mutex_unlock(&i915->drm.struct_mutex);
+ clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
if (i915_terminally_wedged(&i915->gpu_error))
return -EIO;
if (!igt_can_mi_store_dword_imm(i915))
return 0;
+ set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
mutex_lock(&i915->drm.struct_mutex);
err = hang_init(&h, i915);
if (err)
i915_reset(i915);
- GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS,
+ GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
&i915->gpu_error.flags));
+
if (prev->fence.error != -EIO) {
pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
prev->fence.error);
hang_fini(&h);
unlock:
mutex_unlock(&i915->drm.struct_mutex);
+ clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
if (i915_terminally_wedged(&i915->gpu_error))
return -EIO;