drm/amdgpu: throttle buffer migrations at CS using a fixed MBps limit (v2)

author Marek Olšák <marek.olsak@amd.com>

Wed, 17 Aug 2016 21:49:27 +0000 (23:49 +0200)

committer Alex Deucher <alexander.deucher@amd.com>

Tue, 30 Aug 2016 21:54:30 +0000 (17:54 -0400)
author Marek Olšák <marek.olsak@amd.com>
Wed, 17 Aug 2016 21:49:27 +0000 (23:49 +0200)
committer Alex Deucher <alexander.deucher@amd.com>
Tue, 30 Aug 2016 21:54:30 +0000 (17:54 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 4cfcf9c378005f824a909d1776802399fada708b..938ef1cb68ccce56634a5873f706805bdc09a924 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -64,6 +64,7 @@
  extern int amdgpu_modeset;
  extern int amdgpu_vram_limit;
  extern int amdgpu_gart_size;
+extern int amdgpu_moverate;
  extern int amdgpu_benchmarking;
  extern int amdgpu_testing;
  extern int amdgpu_audio;
@@ -2034,6 +2035,14 @@ struct amdgpu_device {
         atomic64_t                      num_evictions;
         atomic_t                        gpu_reset_counter;
  
+       /* data for buffer migration throttling */
+       struct {
+               spinlock_t              lock;
+               s64                     last_update_us;
+               s64                     accum_us; /* accumulated microseconds */
+               u32                     log2_max_MBps;
+       } mm_stats;
+
         /* display */
         bool                            enable_virtual_display;
         struct amdgpu_mode_info         mode_info;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

index d80e5d3a4add2e94dadd78d2455ba533b4076fd4..82927570333a2fd7c24147b4a6b0bc0c59ee24dd 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -235,56 +235,115 @@ free_chunk:
         return ret;
  }
  
-/* Returns how many bytes TTM can move per IB.
+/* Convert microseconds to bytes. */
+static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
+{
+       if (us <= 0 || !adev->mm_stats.log2_max_MBps)
+               return 0;
+
+       /* Since accum_us is incremented by a million per second, just
+        * multiply it by the number of MB/s to get the number of bytes.
+        */
+       return us << adev->mm_stats.log2_max_MBps;
+}
+
+static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
+{
+       if (!adev->mm_stats.log2_max_MBps)
+               return 0;
+
+       return bytes >> adev->mm_stats.log2_max_MBps;
+}
+
+/* Returns how many bytes TTM can move right now. If no bytes can be moved,
+ * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
+ * which means it can go over the threshold once. If that happens, the driver
+ * will be in debt and no other buffer migrations can be done until that debt
+ * is repaid.
+ *
+ * This approach allows moving a buffer of any size (it's important to allow
+ * that).
+ *
+ * The currency is simply time in microseconds and it increases as the clock
+ * ticks. The accumulated microseconds (us) are converted to bytes and
+ * returned.
   */
  static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
  {
-       u64 real_vram_size = adev->mc.real_vram_size;
-       u64 vram_usage = atomic64_read(&adev->vram_usage);
+       s64 time_us, increment_us;
+       u64 max_bytes;
+       u64 free_vram, total_vram, used_vram;
  
-       /* This function is based on the current VRAM usage.
+       /* Allow a maximum of 200 accumulated ms. This is basically per-IB
+        * throttling.
          *
-        * - If all of VRAM is free, allow relocating the number of bytes that
-        *   is equal to 1/4 of the size of VRAM for this IB.
+        * It means that in order to get full max MBps, at least 5 IBs per
+        * second must be submitted and not more than 200ms apart from each
+        * other.
+        */
+       const s64 us_upper_bound = 200000;
  
-        * - If more than one half of VRAM is occupied, only allow relocating
-        *   1 MB of data for this IB.
-        *
-        * - From 0 to one half of used VRAM, the threshold decreases
-        *   linearly.
-        *         __________________
-        * 1/4 of -|\               |
-        * VRAM    | \              |
-        *         |  \             |
-        *         |   \            |
-        *         |    \           |
-        *         |     \          |
-        *         |      \         |
-        *         |       \________|1 MB
-        *         |----------------|
-        *    VRAM 0 %             100 %
-        *         used            used
-        *
-        * Note: It's a threshold, not a limit. The threshold must be crossed
-        * for buffer relocations to stop, so any buffer of an arbitrary size
-        * can be moved as long as the threshold isn't crossed before
-        * the relocation takes place. We don't want to disable buffer
-        * relocations completely.
+       if (!adev->mm_stats.log2_max_MBps)
+               return 0;
+
+       total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
+       used_vram = atomic64_read(&adev->vram_usage);
+       free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
+
+       spin_lock(&adev->mm_stats.lock);
+
+       /* Increase the amount of accumulated us. */
+       time_us = ktime_to_us(ktime_get());
+       increment_us = time_us - adev->mm_stats.last_update_us;
+       adev->mm_stats.last_update_us = time_us;
+       adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
+                                      us_upper_bound);
+
+       /* This prevents the short period of low performance when the VRAM
+        * usage is low and the driver is in debt or doesn't have enough
+        * accumulated us to fill VRAM quickly.
          *
-        * The idea is that buffers should be placed in VRAM at creation time
-        * and TTM should only do a minimum number of relocations during
-        * command submission. In practice, you need to submit at least
-        * a dozen IBs to move all buffers to VRAM if they are in GTT.
+        * The situation can occur in these cases:
+        * - a lot of VRAM is freed by userspace
+        * - the presence of a big buffer causes a lot of evictions
+        *   (solution: split buffers into smaller ones)
          *
-        * Also, things can get pretty crazy under memory pressure and actual
-        * VRAM usage can change a lot, so playing safe even at 50% does
-        * consistently increase performance.
+        * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
+        * accum_us to a positive number.
          */
+       if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
+               s64 min_us;
+
+               /* Be more aggresive on dGPUs. Try to fill a portion of free
+                * VRAM now.
+                */
+               if (!(adev->flags & AMD_IS_APU))
+                       min_us = bytes_to_us(adev, free_vram / 4);
+               else
+                       min_us = 0; /* Reset accum_us on APUs. */
+
+               adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+       }
  
-       u64 half_vram = real_vram_size >> 1;
-       u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;
-       u64 bytes_moved_threshold = half_free_vram >> 1;
-       return max(bytes_moved_threshold, 1024*1024ull);
+       /* This returns 0 if the driver is in debt to disallow (optional)
+        * buffer moves.
+        */
+       max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
+
+       spin_unlock(&adev->mm_stats.lock);
+       return max_bytes;
+}
+
+/* Report how many bytes have really been moved for the last command
+ * submission. This can result in a debt that can stop buffer migrations
+ * temporarily.
+ */
+static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
+                                        u64 num_bytes)
+{
+       spin_lock(&adev->mm_stats.lock);
+       adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
+       spin_unlock(&adev->mm_stats.lock);
  }
  
  static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
@@ -297,15 +356,10 @@ static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
         if (bo->pin_count)
                 return 0;
  
-       /* Avoid moving this one if we have moved too many buffers
-        * for this IB already.
-        *
-        * Note that this allows moving at least one buffer of
-        * any size, because it doesn't take the current "bo"
-        * into account. We don't want to disallow buffer moves
-        * completely.
+       /* Don't move this buffer if we have depleted our allowance
+        * to move it. Don't move anything if the threshold is zero.
          */
-       if (p->bytes_moved <= p->bytes_moved_threshold)
+       if (p->bytes_moved < p->bytes_moved_threshold)
                 domain = bo->prefered_domains;
         else
                 domain = bo->allowed_domains;
@@ -494,6 +548,8 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
                 goto error_validate;
         }
  
+       amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
+
         fpriv->vm.last_eviction_counter =
                 atomic64_read(&p->adev->num_evictions);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 1ef4034b3be539b9154b916521003c482806948b..847583d8a3b3c10f50222e1e658b8141d6a134e6 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1490,6 +1490,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
  {
         int r, i;
         bool runtime = false;
+       u32 max_MBps;
  
         adev->shutdown = false;
         adev->dev = &pdev->dev;
@@ -1549,6 +1550,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
         spin_lock_init(&adev->didt_idx_lock);
         spin_lock_init(&adev->gc_cac_idx_lock);
         spin_lock_init(&adev->audio_endpt_idx_lock);
+       spin_lock_init(&adev->mm_stats.lock);
  
         INIT_LIST_HEAD(&adev->shadow_list);
         mutex_init(&adev->shadow_list_lock);
@@ -1660,6 +1662,14 @@ int amdgpu_device_init(struct amdgpu_device *adev,
  
         adev->accel_working = true;
  
+       /* Initialize the buffer migration limit. */
+       if (amdgpu_moverate >= 0)
+               max_MBps = amdgpu_moverate;
+       else
+               max_MBps = 8; /* Allow 8 MB/s. */
+       /* Get a log2 for easy divisions. */
+       adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
+
         amdgpu_fbdev_init(adev);
  
         r = amdgpu_ib_pool_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index 1b787d974515f075c730ab42083d42e96611f159..6fed75454800d8f0cdac0faf1bed66b4455f1281 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -62,6 +62,7 @@
  
  int amdgpu_vram_limit = 0;
  int amdgpu_gart_size = -1; /* auto */
+int amdgpu_moverate = -1; /* auto */
  int amdgpu_benchmarking = 0;
  int amdgpu_testing = 0;
  int amdgpu_audio = -1;
@@ -100,6 +101,9 @@ module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
  MODULE_PARM_DESC(gartsize, "Size of PCIE/IGP gart to setup in megabytes (32, 64, etc., -1 = auto)");
  module_param_named(gartsize, amdgpu_gart_size, int, 0600);
  
+MODULE_PARM_DESC(moverate, "Maximum buffer migration rate in MB/s. (32, 64, etc., -1=auto, 0=1=disabled)");
+module_param_named(moverate, amdgpu_moverate, int, 0600);
+
  MODULE_PARM_DESC(benchmark, "Run benchmark");
  module_param_named(benchmark, amdgpu_benchmarking, int, 0444);
author	Marek Olšák <marek.olsak@amd.com>
	Wed, 17 Aug 2016 21:49:27 +0000 (23:49 +0200)
committer	Alex Deucher <alexander.deucher@amd.com>
	Tue, 30 Aug 2016 21:54:30 +0000 (17:54 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c		patch \| blob \| blame \| history