drm/i915: Reuse the active golden render state batch
author Chris Wilson <chris@chris-wilson.co.uk>
Fri, 28 Oct 2016 12:58:31 +0000 (13:58 +0100)
committer Chris Wilson <chris@chris-wilson.co.uk>
Fri, 28 Oct 2016 19:53:44 +0000 (20:53 +0100)
The golden render state is constant, but we recreate the batch setting
it up for every new context. If we keep that batch in a volatile cache
we can safely reuse it whenever we need to initialise a new context. We
mark the pages as purgeable and use the shrinker to recover pages from
the batch whenever we face memory pressure, recreating that batch afresh
on the next new context.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-8-chris@chris-wilson.co.uk
drivers/gpu/drm/i915/i915_gem_render_state.c
drivers/gpu/drm/i915/i915_gem_render_state.h
drivers/gpu/drm/i915/intel_engine_cs.c
drivers/gpu/drm/i915/intel_lrc.c
drivers/gpu/drm/i915/intel_ringbuffer.c
drivers/gpu/drm/i915/intel_ringbuffer.h

index 217e0b58b9303db75e10b52e5567048610bcdf1f..9625e1a662ed90bdb284557d14f19a40b1b10bfe 100644 (file)
 #include "i915_drv.h"
 #include "intel_renderstate.h"
 
-struct render_state {
+struct intel_render_state {
        const struct intel_renderstate_rodata *rodata;
        struct i915_vma *vma;
-       u32 aux_batch_size;
-       u32 aux_batch_offset;
+       u32 batch_offset;
+       u32 batch_size;
+       u32 aux_offset;
+       u32 aux_size;
 };
 
 static const struct intel_renderstate_rodata *
-render_state_get_rodata(const struct drm_i915_gem_request *req)
+render_state_get_rodata(const struct intel_engine_cs *engine)
 {
-       switch (INTEL_GEN(req->i915)) {
+       switch (INTEL_GEN(engine->i915)) {
        case 6:
                return &gen6_null_state;
        case 7:
@@ -63,29 +65,27 @@ render_state_get_rodata(const struct drm_i915_gem_request *req)
  */
 #define OUT_BATCH(batch, i, val)                               \
        do {                                                    \
-               if (WARN_ON((i) >= PAGE_SIZE / sizeof(u32))) {  \
-                       ret = -ENOSPC;                          \
-                       goto err_out;                           \
-               }                                               \
+               if ((i) >= PAGE_SIZE / sizeof(u32))             \
+                       goto err;                               \
                (batch)[(i)++] = (val);                         \
        } while(0)
 
-static int render_state_setup(struct render_state *so)
+static int render_state_setup(struct intel_render_state *so,
+                             struct drm_i915_private *i915)
 {
-       struct drm_i915_private *dev_priv = to_i915(so->vma->vm->dev);
        const struct intel_renderstate_rodata *rodata = so->rodata;
-       const bool has_64bit_reloc = INTEL_GEN(dev_priv) >= 8;
+       const bool has_64bit_reloc = INTEL_GEN(i915) >= 8;
+       struct drm_i915_gem_object *obj = so->vma->obj;
        unsigned int i = 0, reloc_index = 0;
-       struct page *page;
+       unsigned int needs_clflush;
        u32 *d;
        int ret;
 
-       ret = i915_gem_object_set_to_cpu_domain(so->vma->obj, true);
+       ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
        if (ret)
                return ret;
 
-       page = i915_gem_object_get_dirty_page(so->vma->obj, 0);
-       d = kmap(page);
+       d = kmap_atomic(i915_gem_object_get_dirty_page(obj, 0));
 
        while (i < rodata->batch_items) {
                u32 s = rodata->batch[i];
@@ -95,10 +95,8 @@ static int render_state_setup(struct render_state *so)
                        s = lower_32_bits(r);
                        if (has_64bit_reloc) {
                                if (i + 1 >= rodata->batch_items ||
-                                   rodata->batch[i + 1] != 0) {
-                                       ret = -EINVAL;
-                                       goto err_out;
-                               }
+                                   rodata->batch[i + 1] != 0)
+                                       goto err;
 
                                d[i++] = s;
                                s = upper_32_bits(r);
@@ -110,12 +108,20 @@ static int render_state_setup(struct render_state *so)
                d[i++] = s;
        }
 
+       if (rodata->reloc[reloc_index] != -1) {
+               DRM_ERROR("only %d relocs resolved\n", reloc_index);
+               goto err;
+       }
+
+       so->batch_offset = so->vma->node.start;
+       so->batch_size = rodata->batch_items * sizeof(u32);
+
        while (i % CACHELINE_DWORDS)
                OUT_BATCH(d, i, MI_NOOP);
 
-       so->aux_batch_offset = i * sizeof(u32);
+       so->aux_offset = i * sizeof(u32);
 
-       if (HAS_POOLED_EU(dev_priv)) {
+       if (HAS_POOLED_EU(i915)) {
                /*
                 * We always program 3x6 pool config but depending upon which
                 * subslice is disabled HW drops down to appropriate config
@@ -143,89 +149,131 @@ static int render_state_setup(struct render_state *so)
        }
 
        OUT_BATCH(d, i, MI_BATCH_BUFFER_END);
-       so->aux_batch_size = (i * sizeof(u32)) - so->aux_batch_offset;
-
+       so->aux_size = i * sizeof(u32) - so->aux_offset;
+       so->aux_offset += so->batch_offset;
        /*
         * Since we are sending length, we need to strictly conform to
         * all requirements. For Gen2 this must be a multiple of 8.
         */
-       so->aux_batch_size = ALIGN(so->aux_batch_size, 8);
-
-       kunmap(page);
-
-       ret = i915_gem_object_set_to_gtt_domain(so->vma->obj, false);
-       if (ret)
-               return ret;
-
-       if (rodata->reloc[reloc_index] != -1) {
-               DRM_ERROR("only %d relocs resolved\n", reloc_index);
-               return -EINVAL;
-       }
+       so->aux_size = ALIGN(so->aux_size, 8);
 
-       return 0;
+       if (needs_clflush)
+               drm_clflush_virt_range(d, i * sizeof(u32));
+       kunmap_atomic(d);
 
-err_out:
-       kunmap(page);
+       ret = i915_gem_object_set_to_gtt_domain(obj, false);
+out:
+       i915_gem_obj_finish_shmem_access(obj);
        return ret;
+
+err:
+       kunmap_atomic(d);
+       ret = -EINVAL;
+       goto out;
 }
 
 #undef OUT_BATCH
 
-int i915_gem_render_state_init(struct drm_i915_gem_request *req)
+int i915_gem_render_state_init(struct intel_engine_cs *engine)
 {
-       struct render_state so;
+       struct intel_render_state *so;
+       const struct intel_renderstate_rodata *rodata;
        struct drm_i915_gem_object *obj;
        int ret;
 
-       if (WARN_ON(req->engine->id != RCS))
-               return -ENOENT;
+       if (engine->id != RCS)
+               return 0;
 
-       so.rodata = render_state_get_rodata(req);
-       if (!so.rodata)
+       rodata = render_state_get_rodata(engine);
+       if (!rodata)
                return 0;
 
-       if (so.rodata->batch_items * 4 > 4096)
+       if (rodata->batch_items * 4 > 4096)
                return -EINVAL;
 
-       obj = i915_gem_object_create_internal(req->i915, 4096);
-       if (IS_ERR(obj))
-               return PTR_ERR(obj);
+       so = kmalloc(sizeof(*so), GFP_KERNEL);
+       if (!so)
+               return -ENOMEM;
 
-       so.vma = i915_vma_create(obj, &req->i915->ggtt.base, NULL);
-       if (IS_ERR(so.vma)) {
-               ret = PTR_ERR(so.vma);
-               goto err_obj;
+       obj = i915_gem_object_create_internal(engine->i915, 4096);
+       if (IS_ERR(obj)) {
+               ret = PTR_ERR(obj);
+               goto err_free;
        }
 
-       ret = i915_vma_pin(so.vma, 0, 0, PIN_GLOBAL);
-       if (ret)
+       so->vma = i915_vma_create(obj, &engine->i915->ggtt.base, NULL);
+       if (IS_ERR(so->vma)) {
+               ret = PTR_ERR(so->vma);
                goto err_obj;
+       }
+
+       so->rodata = rodata;
+       engine->render_state = so;
+       return 0;
 
-       ret = render_state_setup(&so);
+err_obj:
+       i915_gem_object_put(obj);
+err_free:
+       kfree(so);
+       return ret;
+}
+
+int i915_gem_render_state_emit(struct drm_i915_gem_request *req)
+{
+       struct intel_render_state *so;
+       int ret;
+
+       so = req->engine->render_state;
+       if (!so)
+               return 0;
+
+       /* Recreate the page after shrinking */
+       if (!so->vma->obj->pages)
+               so->batch_offset = -1;
+
+       ret = i915_vma_pin(so->vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
        if (ret)
-               goto err_unpin;
+               return ret;
 
-       ret = req->engine->emit_bb_start(req, so.vma->node.start,
-                                        so.rodata->batch_items * 4,
+       if (so->vma->node.start != so->batch_offset) {
+               ret = render_state_setup(so, req->i915);
+               if (ret)
+                       goto err_unpin;
+       }
+
+       ret = req->engine->emit_bb_start(req,
+                                        so->batch_offset, so->batch_size,
                                         I915_DISPATCH_SECURE);
        if (ret)
                goto err_unpin;
 
-       if (so.aux_batch_size > 8) {
+       if (so->aux_size > 8) {
                ret = req->engine->emit_bb_start(req,
-                                                (so.vma->node.start +
-                                                 so.aux_batch_offset),
-                                                so.aux_batch_size,
+                                                so->aux_offset, so->aux_size,
                                                 I915_DISPATCH_SECURE);
                if (ret)
                        goto err_unpin;
        }
 
-       i915_vma_move_to_active(so.vma, req, 0);
+       i915_vma_move_to_active(so->vma, req, 0);
 err_unpin:
-       i915_vma_unpin(so.vma);
-       i915_vma_close(so.vma);
-err_obj:
-       __i915_gem_object_release_unless_active(obj);
+       i915_vma_unpin(so->vma);
        return ret;
 }
+
+void i915_gem_render_state_fini(struct intel_engine_cs *engine)
+{
+       struct intel_render_state *so;
+       struct drm_i915_gem_object *obj;
+
+       so = fetch_and_zero(&engine->render_state);
+       if (!so)
+               return;
+
+       obj = so->vma->obj;
+
+       i915_vma_close(so->vma);
+       __i915_gem_object_release_unless_active(obj);
+
+       kfree(so);
+}
index 18cce3f06e9ce603878ad2e4ecc12f662dd3dc91..87481845799d39e9990e09bd07e319a91543a97e 100644 (file)
@@ -26,6 +26,8 @@
 
 struct drm_i915_gem_request;
 
-int i915_gem_render_state_init(struct drm_i915_gem_request *req);
+int i915_gem_render_state_init(struct intel_engine_cs *engine);
+int i915_gem_render_state_emit(struct drm_i915_gem_request *req);
+void i915_gem_render_state_fini(struct intel_engine_cs *engine);
 
 #endif /* _I915_GEM_RENDER_STATE_H_ */
index b2de371d2bf5922442b7a9dded87d7e40af415d6..fd551824adf9413062318f5d399183a52fbadad4 100644 (file)
@@ -314,6 +314,10 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
        if (ret)
                return ret;
 
+       ret = i915_gem_render_state_init(engine);
+       if (ret)
+               return ret;
+
        return 0;
 }
 
@@ -328,6 +332,7 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 {
        intel_engine_cleanup_scratch(engine);
 
+       i915_gem_render_state_fini(engine);
        intel_engine_fini_breadcrumbs(engine);
        intel_engine_cleanup_cmd_parser(engine);
        i915_gem_batch_pool_fini(&engine->batch_pool);
index bc86585b9fbba689b59fb381f8580275a3bfa39c..1c1bd30e8b2db05156f68643b2fca868d209566c 100644 (file)
@@ -1637,7 +1637,7 @@ static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
        if (ret)
                DRM_ERROR("MOCS failed to program: expect performance issues.\n");
 
-       return i915_gem_render_state_init(req);
+       return i915_gem_render_state_emit(req);
 }
 
 /**
index a15b9b5f2924804cc4091206758df56895438d85..aaa46d9ffbc13560e4f5ace4e0b84128af899bfb 100644 (file)
@@ -648,7 +648,7 @@ static int intel_rcs_ctx_init(struct drm_i915_gem_request *req)
        if (ret != 0)
                return ret;
 
-       ret = i915_gem_render_state_init(req);
+       ret = i915_gem_render_state_emit(req);
        if (ret)
                return ret;
 
index 09bb89cfb7c3612e429c4be16abbe08f09b696b4..cb6e96c6cd47d4ecd813742924613f1febc83230 100644 (file)
@@ -157,6 +157,7 @@ struct i915_ctx_workarounds {
 };
 
 struct drm_i915_gem_request;
+struct intel_render_state;
 
 struct intel_engine_cs {
        struct drm_i915_private *i915;
@@ -184,6 +185,8 @@ struct intel_engine_cs {
        unsigned int irq_shift;
        struct intel_ring *buffer;
 
+       struct intel_render_state *render_state;
+
        /* Rather than have every client wait upon all user interrupts,
         * with the herd waking after every interrupt and each doing the
         * heavyweight seqno dance, we delegate the task (of being the