drm/amdgpu: add amdgpu soft reset
authorChunming Zhou <David1.Zhou@amd.com>
Fri, 15 Jul 2016 07:57:13 +0000 (15:57 +0800)
committerAlex Deucher <alexander.deucher@amd.com>
Mon, 8 Aug 2016 15:32:05 +0000 (11:32 -0400)
Check the GPU status first: if MC/VMC/DISPLAY hangs, directly trigger a full reset.
If an engine hangs, trigger an engine soft reset; if the soft reset fails, fall
back to a full reset.

Signed-off-by: Chunming Zhou <David1.Zhou@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/include/amd_shared.h

index b88620586c8ef83ddca00730c85bcff4b9ff1473..2bd2b19d46668218f0651274e17fc7c8d1b0a04f 100644 (file)
@@ -1962,7 +1962,8 @@ int amdgpu_pre_soft_reset(struct amdgpu_device *adev)
        for (i = 0; i < adev->num_ip_blocks; i++) {
                if (!adev->ip_block_status[i].valid)
                        continue;
-               if (adev->ip_blocks[i].funcs->pre_soft_reset) {
+               if (adev->ip_block_status[i].hang &&
+                   adev->ip_blocks[i].funcs->pre_soft_reset) {
                        r = adev->ip_blocks[i].funcs->pre_soft_reset(adev);
                        if (r)
                                return r;
@@ -1972,6 +1973,58 @@ int amdgpu_pre_soft_reset(struct amdgpu_device *adev)
        return 0;
 }
 
+/**
+ * amdgpu_need_full_reset - check whether a hung IP block forces a full reset
+ *
+ * @adev: amdgpu device pointer
+ *
+ * ip_block_status[] is indexed in parallel with ip_blocks[] (by block
+ * position, 0..num_ip_blocks-1), not by AMD_IP_BLOCK_TYPE_* value, so the
+ * blocks are walked and matched on their type instead of using the type id
+ * as an array index.
+ * Returns true if any valid, hung block is of a type that requires a full
+ * reset, false otherwise.
+ */
+static bool amdgpu_need_full_reset(struct amdgpu_device *adev)
+{
+       int i;
+
+       for (i = 0; i < adev->num_ip_blocks; i++) {
+               /* Same filtering as the other per-block reset helpers. */
+               if (!adev->ip_block_status[i].valid)
+                       continue;
+               if (!adev->ip_block_status[i].hang)
+                       continue;
+
+               switch (adev->ip_blocks[i].type) {
+               case AMD_IP_BLOCK_TYPE_GMC:
+               case AMD_IP_BLOCK_TYPE_IH:
+               case AMD_IP_BLOCK_TYPE_SMC:
+               case AMD_IP_BLOCK_TYPE_GFX:
+               case AMD_IP_BLOCK_TYPE_SDMA:
+               case AMD_IP_BLOCK_TYPE_UVD:
+               case AMD_IP_BLOCK_TYPE_VCE:
+               case AMD_IP_BLOCK_TYPE_ACP:
+               case AMD_IP_BLOCK_TYPE_DCE:
+                       DRM_INFO("Some block need full reset!\n");
+                       return true;
+               default:
+                       break;
+               }
+       }
+       return false;
+}
+
+/**
+ * amdgpu_soft_reset - run the soft_reset callback of every hung IP block
+ *
+ * @adev: amdgpu device pointer
+ *
+ * Visits the IP blocks in order; blocks that are not valid, not flagged as
+ * hung, or that provide no soft_reset callback are skipped.
+ * Returns 0 on success, or the first non-zero value returned by a callback
+ * (remaining blocks are then not visited).
+ */
+static int amdgpu_soft_reset(struct amdgpu_device *adev)
+{
+       int idx;
+
+       for (idx = 0; idx < adev->num_ip_blocks; idx++) {
+               int ret;
+
+               if (!adev->ip_block_status[idx].valid)
+                       continue;
+               if (!adev->ip_block_status[idx].hang)
+                       continue;
+               if (!adev->ip_blocks[idx].funcs->soft_reset)
+                       continue;
+
+               ret = adev->ip_blocks[idx].funcs->soft_reset(adev);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/**
+ * amdgpu_post_soft_reset - run the post_soft_reset callback of hung IP blocks
+ *
+ * @adev: amdgpu device pointer
+ *
+ * Visits the IP blocks in order; blocks that are not valid, not flagged as
+ * hung, or that provide no post_soft_reset callback are skipped.
+ * Returns 0 on success, or the first non-zero value returned by a callback
+ * (remaining blocks are then not visited).
+ */
+static int amdgpu_post_soft_reset(struct amdgpu_device *adev)
+{
+       int idx;
+
+       for (idx = 0; idx < adev->num_ip_blocks; idx++) {
+               int ret;
+
+               if (!adev->ip_block_status[idx].valid)
+                       continue;
+               if (!adev->ip_block_status[idx].hang)
+                       continue;
+               if (!adev->ip_blocks[idx].funcs->post_soft_reset)
+                       continue;
+
+               ret = adev->ip_blocks[idx].funcs->post_soft_reset(adev);
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
 /**
  * amdgpu_gpu_reset - reset the asic
  *
@@ -1984,6 +2037,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 {
        int i, r;
        int resched;
+       bool need_full_reset;
 
        if (!amdgpu_check_soft_reset(adev)) {
                DRM_INFO("No hardware hang detected. Did some blocks stall?\n");
@@ -2007,28 +2061,42 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
        /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
        amdgpu_fence_driver_force_completion(adev);
 
-       /* save scratch */
-       amdgpu_atombios_scratch_regs_save(adev);
-       r = amdgpu_suspend(adev);
+       need_full_reset = amdgpu_need_full_reset(adev);
 
-retry:
-       /* Disable fb access */
-       if (adev->mode_info.num_crtc) {
-               struct amdgpu_mode_mc_save save;
-               amdgpu_display_stop_mc_access(adev, &save);
-               amdgpu_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GMC);
+       if (!need_full_reset) {
+               amdgpu_pre_soft_reset(adev);
+               r = amdgpu_soft_reset(adev);
+               amdgpu_post_soft_reset(adev);
+               if (r || amdgpu_check_soft_reset(adev)) {
+                       DRM_INFO("soft reset failed, will fallback to full reset!\n");
+                       need_full_reset = true;
+               }
        }
 
-       r = amdgpu_asic_reset(adev);
-       /* post card */
-       amdgpu_atom_asic_init(adev->mode_info.atom_context);
+       if (need_full_reset) {
+               /* save scratch */
+               amdgpu_atombios_scratch_regs_save(adev);
+               r = amdgpu_suspend(adev);
 
-       if (!r) {
-               dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
-               r = amdgpu_resume(adev);
+retry:
+               /* Disable fb access */
+               if (adev->mode_info.num_crtc) {
+                       struct amdgpu_mode_mc_save save;
+                       amdgpu_display_stop_mc_access(adev, &save);
+                       amdgpu_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GMC);
+               }
+
+               r = amdgpu_asic_reset(adev);
+               /* post card */
+               amdgpu_atom_asic_init(adev->mode_info.atom_context);
+
+               if (!r) {
+                       dev_info(adev->dev, "GPU reset succeeded, trying to resume\n");
+                       r = amdgpu_resume(adev);
+               }
+               /* restore scratch */
+               amdgpu_atombios_scratch_regs_restore(adev);
        }
-       /* restore scratch */
-       amdgpu_atombios_scratch_regs_restore(adev);
        if (!r) {
                r = amdgpu_ib_ring_tests(adev);
                if (r) {
index 2a2a5aa39b994829f091f0dcfb42bf28f9d28d55..db710418f35fa0f8e38f5dae47c5084faccf2a79 100644 (file)
@@ -165,6 +165,8 @@ struct amd_ip_funcs {
        int (*pre_soft_reset)(void *handle);
        /* soft reset the IP block */
        int (*soft_reset)(void *handle);
+       /* post soft reset the IP block */
+       int (*post_soft_reset)(void *handle);
        /* enable/disable cg for the IP block */
        int (*set_clockgating_state)(void *handle,
                                     enum amd_clockgating_state state);