From 52a42cec4b7088599a9f51187c454d45c460167a Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Thu, 18 Aug 2016 17:17:18 +0100
Subject: [PATCH] drm/i915/cmdparser: Accelerate copies from WC memory

If we need to use clflush to prepare our batch for reads from memory,
we can bypass the cache instead by using non-temporal copies.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Matthew Auld
Link: http://patchwork.freedesktop.org/patch/msgid/20160818161718.27187-39-chris@chris-wilson.co.uk
---
 drivers/gpu/drm/i915/i915_cmd_parser.c | 70 ++++++++++++++++----------
 1 file changed, 43 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
index e128e3ab8452..3c72b3b103e7 100644
--- a/drivers/gpu/drm/i915/i915_cmd_parser.c
+++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
@@ -965,8 +965,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 {
 	unsigned int src_needs_clflush;
 	unsigned int dst_needs_clflush;
-	void *dst, *ptr;
-	int offset, n;
+	void *dst, *src;
 	int ret;
 
 	ret = i915_gem_obj_prepare_shmem_read(src_obj, &src_needs_clflush);
@@ -983,31 +982,48 @@ static u32 *copy_batch(struct drm_i915_gem_object *dst_obj,
 	if (IS_ERR(dst))
 		goto unpin_dst;
 
-	ptr = dst;
-	offset = offset_in_page(batch_start_offset);
-
-	/* We can avoid clflushing partial cachelines before the write if we
-	 * only every write full cache-lines. Since we know that both the
-	 * source and destination are in multiples of PAGE_SIZE, we can simply
-	 * round up to the next cacheline. We don't care about copying too much
-	 * here as we only validate up to the end of the batch.
-	 */
-	if (dst_needs_clflush & CLFLUSH_BEFORE)
-		batch_len = roundup(batch_len, boot_cpu_data.x86_clflush_size);
-
-	for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
-		int len = min_t(int, batch_len, PAGE_SIZE - offset);
-		void *vaddr;
-
-		vaddr = kmap_atomic(i915_gem_object_get_page(src_obj, n));
-		if (src_needs_clflush)
-			drm_clflush_virt_range(vaddr + offset, len);
-		memcpy(ptr, vaddr + offset, len);
-		kunmap_atomic(vaddr);
-
-		ptr += len;
-		batch_len -= len;
-		offset = 0;
+	src = ERR_PTR(-ENODEV);
+	if (src_needs_clflush &&
+	    i915_memcpy_from_wc((void *)(uintptr_t)batch_start_offset, 0, 0)) {
+		src = i915_gem_object_pin_map(src_obj, I915_MAP_WC);
+		if (!IS_ERR(src)) {
+			i915_memcpy_from_wc(dst,
+					    src + batch_start_offset,
+					    ALIGN(batch_len, 16));
+			i915_gem_object_unpin_map(src_obj);
+		}
+	}
+	if (IS_ERR(src)) {
+		void *ptr;
+		int offset, n;
+
+		offset = offset_in_page(batch_start_offset);
+
+		/* We can avoid clflushing partial cachelines before the write
+		 * if we only every write full cache-lines. Since we know that
+		 * both the source and destination are in multiples of
+		 * PAGE_SIZE, we can simply round up to the next cacheline.
+		 * We don't care about copying too much here as we only
+		 * validate up to the end of the batch.
+		 */
+		if (dst_needs_clflush & CLFLUSH_BEFORE)
+			batch_len = roundup(batch_len,
+					    boot_cpu_data.x86_clflush_size);
+
+		ptr = dst;
+		for (n = batch_start_offset >> PAGE_SHIFT; batch_len; n++) {
+			int len = min_t(int, batch_len, PAGE_SIZE - offset);
+
+			src = kmap_atomic(i915_gem_object_get_page(src_obj, n));
+			if (src_needs_clflush)
+				drm_clflush_virt_range(src + offset, len);
+			memcpy(ptr, src + offset, len);
+			kunmap_atomic(src);
+
+			ptr += len;
+			batch_len -= len;
+			offset = 0;
+		}
 	}
 
 	/* dst_obj is returned with vmap pinned */
-- 
2.20.1
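
Background note, not part of the patch above: the sketch below illustrates the
non-temporal copy technique that i915_memcpy_from_wc() relies on, written as a
standalone userspace C function using SSE4.1 intrinsics. The helper name
copy_from_wc_nontemporal is hypothetical and this is not the kernel's
implementation; it only shows why MOVNTDQA lets the copy bypass the CPU cache,
so the batch source never needs to be clflushed. It assumes dst, src and len
are all 16-byte aligned and that the CPU supports SSE4.1 (build with -msse4.1).

/*
 * Hypothetical userspace sketch of a streaming (non-temporal) copy from
 * write-combining memory.  MOVNTDQA loads pull 16-byte chunks through the
 * WC read buffers without allocating cachelines, so no clflush is needed
 * on the source afterwards.  Illustrative only, not the kernel code.
 */
#include <smmintrin.h>	/* SSE4.1: _mm_stream_load_si128 */
#include <stddef.h>

static void copy_from_wc_nontemporal(void *dst, const void *src, size_t len)
{
	__m128i *d = dst;
	const __m128i *s = src;
	size_t i;

	/* caller guarantees dst, src and len are 16-byte aligned */
	for (i = 0; i < len / 16; i++)
		_mm_store_si128(&d[i],
				_mm_stream_load_si128((__m128i *)&s[i]));
}

In the patch itself the same idea is gated at runtime: the dummy
i915_memcpy_from_wc(batch_start_offset, 0, 0) call acts as the helper's
alignment/feature check, and copy_batch() falls back to the original
kmap_atomic + clflush + memcpy loop when that check fails.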