drm/amdgpu: changes in gfx DMAframe scheme (v2)
author Monk Liu <Monk.Liu@amd.com>
Wed, 15 Mar 2017 04:18:57 +0000 (12:18 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 30 Mar 2017 03:55:42 +0000 (23:55 -0400)
1) Adapt to Vulkan:
Now use a double SWITCH_BUFFER to replace the 128 NOPs w/a,
because with Vulkan the UMD can insert 7 ~ 16 IBs per submit,
so a 256 DW allocation can no longer hold the whole DMAframe
(if we still insert those 128 NOPs). The CP team suggests
using double SWITCH_BUFFERs instead of the tricky 128 NOPs w/a.

2) To fix the CE VM fault issue when MCBP is introduced:
Need one more COND_EXEC wrapping the IB part (the original one
is for the VM switch part).

This change fixes the VM fault issue that the scenario below
causes without it:

>CE passed original COND_EXEC (no MCBP issued this moment),
 proceed as normal.

>DE catches up to this COND_EXEC, but this time MCBP is issued,
 thus DE treats all following packets as NOPs. The following
 VM switch packets now look just like NOPs to DE, so DE
 doesn't do the VM flush at all.

>Now CE proceeds to the first IBc, and triggers VM fault,
 because DE didn't do VM flush for this DMAframe.

3) Change the estimated alloc size for gfx9.
With the new DMAframe scheme, we need to modify emit_frame_size
for gfx9.

4) No need to insert 128 NOPs after the gfx8 VM flush anymore,
because a double SWITCH_BUFFER is now appended to the VM flush;
for gfx7 we already use a double SWITCH_BUFFER following
vm_flush, so no change is needed for it.

5) Change emit_frame_size for gfx8

v2: squash in BUG removal from Monk

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

index 2957404bd44a5bf5c32e13ff4b902877d06c1e49..c40c1a16e72e2acbcc9dbe24c7bb405be1e39c57 100644 (file)
@@ -912,7 +912,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 
                        /* each GFX command submit allows 0 or 1 IB preemptible for CE & DE */
                        if (ce_preempt > 1 || de_preempt > 1)
-                               BUG();
+                               return -EINVAL;
                }
 
                r = amdgpu_cs_get_ring(adev, chunk_ib->ip_type,
index c4857083d834e899f3f3d701fa43b65555c9283b..6b8bb1b070cca3078e76c48b3f08be4bfd6021ff 100644 (file)
@@ -161,9 +161,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
                return r;
        }
 
-       if (ring->funcs->init_cond_exec)
-               patch_offset = amdgpu_ring_init_cond_exec(ring);
-
        if (vm) {
                r = amdgpu_vm_flush(ring, job);
                if (r) {
@@ -172,7 +169,10 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
                }
        }
 
-       if (ring->funcs->emit_hdp_flush
+       if (ring->funcs->init_cond_exec)
+               patch_offset = amdgpu_ring_init_cond_exec(ring);
+
+               if (ring->funcs->emit_hdp_flush
 #ifdef CONFIG_X86_64
            && !(adev->flags & AMD_IS_APU)
 #endif
index 10e8232d6cacb342d983fddab0fa98faa32abc4e..72bef223a080b95f18a67c357f4ab13dcd5994f0 100644 (file)
@@ -577,42 +577,59 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
                id->oa_size != job->oa_size);
        int r;
 
-       if (ring->funcs->emit_pipeline_sync && (
-           job->vm_needs_flush || gds_switch_needed ||
-           amdgpu_vm_ring_has_compute_vm_bug(ring)))
-               amdgpu_ring_emit_pipeline_sync(ring);
+       if (job->vm_needs_flush || gds_switch_needed ||
+               amdgpu_vm_is_gpu_reset(adev, id) ||
+               amdgpu_vm_ring_has_compute_vm_bug(ring)) {
+               unsigned patch_offset = 0;
 
-       if (ring->funcs->emit_vm_flush && (job->vm_needs_flush ||
-           amdgpu_vm_is_gpu_reset(adev, id))) {
-               struct dma_fence *fence;
-               u64 pd_addr = amdgpu_vm_adjust_mc_addr(adev, job->vm_pd_addr);
+               if (ring->funcs->init_cond_exec)
+                       patch_offset = amdgpu_ring_init_cond_exec(ring);
 
-               trace_amdgpu_vm_flush(pd_addr, ring->idx, job->vm_id);
-               amdgpu_ring_emit_vm_flush(ring, job->vm_id, pd_addr);
+               if (ring->funcs->emit_pipeline_sync &&
+                       (job->vm_needs_flush || gds_switch_needed ||
+                       amdgpu_vm_ring_has_compute_vm_bug(ring)))
+                       amdgpu_ring_emit_pipeline_sync(ring);
 
-               r = amdgpu_fence_emit(ring, &fence);
-               if (r)
-                       return r;
+               if (ring->funcs->emit_vm_flush && (job->vm_needs_flush ||
+                       amdgpu_vm_is_gpu_reset(adev, id))) {
+                       struct dma_fence *fence;
+                       u64 pd_addr = amdgpu_vm_adjust_mc_addr(adev, job->vm_pd_addr);
 
-               mutex_lock(&adev->vm_manager.lock);
-               dma_fence_put(id->last_flush);
-               id->last_flush = fence;
-               mutex_unlock(&adev->vm_manager.lock);
-       }
+                       trace_amdgpu_vm_flush(pd_addr, ring->idx, job->vm_id);
+                       amdgpu_ring_emit_vm_flush(ring, job->vm_id, pd_addr);
 
-       if (gds_switch_needed) {
-               id->gds_base = job->gds_base;
-               id->gds_size = job->gds_size;
-               id->gws_base = job->gws_base;
-               id->gws_size = job->gws_size;
-               id->oa_base = job->oa_base;
-               id->oa_size = job->oa_size;
-               amdgpu_ring_emit_gds_switch(ring, job->vm_id,
-                                           job->gds_base, job->gds_size,
-                                           job->gws_base, job->gws_size,
-                                           job->oa_base, job->oa_size);
-       }
+                       r = amdgpu_fence_emit(ring, &fence);
+                       if (r)
+                               return r;
 
+                       mutex_lock(&adev->vm_manager.lock);
+                       dma_fence_put(id->last_flush);
+                       id->last_flush = fence;
+                       mutex_unlock(&adev->vm_manager.lock);
+               }
+
+               if (gds_switch_needed) {
+                       id->gds_base = job->gds_base;
+                       id->gds_size = job->gds_size;
+                       id->gws_base = job->gws_base;
+                       id->gws_size = job->gws_size;
+                       id->oa_base = job->oa_base;
+                       id->oa_size = job->oa_size;
+                       amdgpu_ring_emit_gds_switch(ring, job->vm_id,
+                                                       job->gds_base, job->gds_size,
+                                                       job->gws_base, job->gws_size,
+                                                       job->oa_base, job->oa_size);
+               }
+
+               if (ring->funcs->patch_cond_exec)
+                       amdgpu_ring_patch_cond_exec(ring, patch_offset);
+
+               /* the double SWITCH_BUFFER here *cannot* be skipped by COND_EXEC */
+               if (ring->funcs->emit_switch_buffer) {
+                       amdgpu_ring_emit_switch_buffer(ring);
+                       amdgpu_ring_emit_switch_buffer(ring);
+               }
+       }
        return 0;
 }
 
index c59bb38c091f438c7e6478fad91f35bd54605300..e0fa0d30e162a0fbe4fa24821b4b0765920ba5a4 100644 (file)
@@ -6675,8 +6675,6 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
                /* sync PFP to ME, otherwise we might get invalid PFP reads */
                amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
                amdgpu_ring_write(ring, 0x0);
-               /* GFX8 emits 128 dw nop to prevent CE access VM before vm_flush finish */
-               amdgpu_ring_insert_nop(ring, 128);
        }
 }
 
@@ -7078,15 +7076,24 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
        .get_rptr = gfx_v8_0_ring_get_rptr,
        .get_wptr = gfx_v8_0_ring_get_wptr_gfx,
        .set_wptr = gfx_v8_0_ring_set_wptr_gfx,
-       .emit_frame_size =
-               20 + /* gfx_v8_0_ring_emit_gds_switch */
-               7 + /* gfx_v8_0_ring_emit_hdp_flush */
-               5 + /* gfx_v8_0_ring_emit_hdp_invalidate */
-               6 + 6 + 6 +/* gfx_v8_0_ring_emit_fence_gfx x3 for user fence, vm fence */
-               7 + /* gfx_v8_0_ring_emit_pipeline_sync */
-               128 + 19 + /* gfx_v8_0_ring_emit_vm_flush */
-               2 + /* gfx_v8_ring_emit_sb */
-               3 + 4 + 29, /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */
+       .emit_frame_size = /* maximum 215dw if count 16 IBs in */
+               5 +  /* COND_EXEC */
+               7 +  /* PIPELINE_SYNC */
+               19 + /* VM_FLUSH */
+               8 +  /* FENCE for VM_FLUSH */
+               20 + /* GDS switch */
+               4 + /* double SWITCH_BUFFER,
+                      the first COND_EXEC jump to the place just
+                          prior to this double SWITCH_BUFFER  */
+               5 + /* COND_EXEC */
+               7 +      /*     HDP_flush */
+               4 +      /*     VGT_flush */
+               14 + /* CE_META */
+               31 + /* DE_META */
+               3 + /* CNTX_CTRL */
+               5 + /* HDP_INVL */
+               8 + 8 + /* FENCE x2 */
+               2, /* SWITCH_BUFFER */
        .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_gfx */
        .emit_ib = gfx_v8_0_ring_emit_ib_gfx,
        .emit_fence = gfx_v8_0_ring_emit_fence_gfx,
index f124f6d3b86916ac77c46bc7a842c19069f7891d..7666add21519d592fea972a80c47b16e9850903e 100644 (file)
@@ -3186,8 +3186,6 @@ static void gfx_v9_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
                /* sync PFP to ME, otherwise we might get invalid PFP reads */
                amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
                amdgpu_ring_write(ring, 0x0);
-               /* Emits 128 dw nop to prevent CE access VM before vm_flush finish */
-               amdgpu_ring_insert_nop(ring, 128);
        }
 }
 
@@ -3682,15 +3680,24 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
        .get_rptr = gfx_v9_0_ring_get_rptr_gfx,
        .get_wptr = gfx_v9_0_ring_get_wptr_gfx,
        .set_wptr = gfx_v9_0_ring_set_wptr_gfx,
-       .emit_frame_size =
-               20 + /* gfx_v9_0_ring_emit_gds_switch */
-               7 + /* gfx_v9_0_ring_emit_hdp_flush */
-               5 + /* gfx_v9_0_ring_emit_hdp_invalidate */
-               8 + 8 + 8 +/* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
-               7 + /* gfx_v9_0_ring_emit_pipeline_sync */
-               128 + 66 + /* gfx_v9_0_ring_emit_vm_flush */
-               2 + /* gfx_v9_ring_emit_sb */
-               3, /* gfx_v9_ring_emit_cntxcntl */
+       .emit_frame_size = /* totally 242 maximum if 16 IBs */
+               5 +  /* COND_EXEC */
+               7 +  /* PIPELINE_SYNC */
+               46 + /* VM_FLUSH */
+               8 +  /* FENCE for VM_FLUSH */
+               20 + /* GDS switch */
+               4 + /* double SWITCH_BUFFER,
+                      the first COND_EXEC jump to the place just
+                          prior to this double SWITCH_BUFFER  */
+               5 + /* COND_EXEC */
+               7 +      /*     HDP_flush */
+               4 +      /*     VGT_flush */
+               14 + /* CE_META */
+               31 + /* DE_META */
+               3 + /* CNTX_CTRL */
+               5 + /* HDP_INVL */
+               8 + 8 + /* FENCE x2 */
+               2, /* SWITCH_BUFFER */
        .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_gfx */
        .emit_ib = gfx_v9_0_ring_emit_ib_gfx,
        .emit_fence = gfx_v9_0_ring_emit_fence,