drm/amdkfd: Enforce kill all waves on process termination
authorBen Goz <ben.goz@amd.com>
Wed, 20 May 2015 15:05:44 +0000 (18:05 +0300)
committerOded Gabbay <oded.gabbay@gmail.com>
Wed, 3 Jun 2015 08:34:47 +0000 (11:34 +0300)
This commit makes sure that on process termination, after
all the active queues have been destroyed, all the existing
wavefronts of the current process are killed.

This ensures that if any of the CUs are blocked by an infinite
loop, the shader is explicitly forced to end.

Signed-off-by: Ben Goz <ben.goz@amd.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_process.c

index 00d8fcfa17ea051f2d10fad4f656c1b0a30ef76f..96153f28d73fa0896ff1676996ef7446d7e80ebc 100644 (file)
@@ -792,6 +792,71 @@ static int dbgdev_wave_control_nodiq(struct kfd_dbgdev *dbgdev,
                                                        reg_sq_cmd.u32All);
 }
 
+/*
+ * dbgdev_wave_reset_wavefronts - kill all wavefronts of a process on a device
+ * @dev: device whose compute VMIDs are scanned
+ * @p:   process whose PASID is looked up
+ *
+ * Scans the compute VMID range described by
+ * dev->shared_resources.compute_vmid_bitmap for a valid VMID-to-PASID
+ * mapping that matches p->pasid, then issues HSA_DBG_WAVEOP_KILL in
+ * HSA_DBG_WAVEMODE_BROADCAST_PROCESS mode on that VMID through
+ * kfd2kgd->wave_control_execute().
+ *
+ * Returns 0 on success, -EFAULT when no VMID is mapped to the PASID or no
+ * per-device data is found for the process, and -EINVAL when the wave
+ * control registers could not be set up.
+ */
+int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p)
+{
+       int status;
+       unsigned int vmid;
+       union SQ_CMD_BITS reg_sq_cmd;
+       union GRBM_GFX_INDEX_BITS reg_gfx_index;
+       struct kfd_process_device *pdd;
+       struct dbg_wave_control_info wac_info;
+       int temp;
+       int first_vmid_to_scan;
+       int last_vmid_to_scan;
+
+       /*
+        * Derive the inclusive VMID range from the bitmap. The lowest set
+        * bit is the first VMID. ffz() of the shifted bitmap is the index of
+        * its first clear bit, i.e. one past the last set bit of the low run,
+        * hence the "- 1" to keep the range inclusive.
+        */
+       first_vmid_to_scan = ffs(dev->shared_resources.compute_vmid_bitmap) - 1;
+       temp = dev->shared_resources.compute_vmid_bitmap >> first_vmid_to_scan;
+       last_vmid_to_scan = first_vmid_to_scan + ffz(temp) - 1;
+
+       reg_sq_cmd.u32All = 0;
+
+       wac_info.mode = HSA_DBG_WAVEMODE_BROADCAST_PROCESS;
+       wac_info.operand = HSA_DBG_WAVEOP_KILL;
+
+       pr_debug("Killing all process wavefronts\n");
+
+       /*
+        * Scan the VMID-to-PASID mappings of the compute VMID range to find
+        * which VMID the process is currently mapped to. The mapping must be
+        * valid AND its PASID must match; comparing the validity flag itself
+        * against the PASID would never identify the right VMID.
+        */
+       for (vmid = first_vmid_to_scan; vmid <= last_vmid_to_scan; vmid++) {
+               if (!dev->kfd2kgd->get_atc_vmid_pasid_mapping_valid
+                               (dev->kgd, vmid))
+                       continue;
+
+               if (dev->kfd2kgd->get_atc_vmid_pasid_mapping_pasid
+                               (dev->kgd, vmid) == p->pasid) {
+                       pr_debug("Killing wave fronts of vmid %d and pasid %d\n",
+                                       vmid, p->pasid);
+                       break;
+               }
+       }
+
+       /* The loop ran to completion: no VMID is mapped to this PASID */
+       if (vmid > last_vmid_to_scan) {
+               pr_err("amdkfd: didn't found vmid for pasid (%d)\n", p->pasid);
+               return -EFAULT;
+       }
+
+       /* Bail out if no per-device data is found for this process */
+       pdd = kfd_get_process_device_data(dev, p);
+       if (!pdd)
+               return -EFAULT;
+
+       status = dbgdev_wave_control_set_registers(&wac_info, &reg_sq_cmd,
+                       &reg_gfx_index);
+       if (status != 0)
+               return -EINVAL;
+
+       /* for non DIQ we need to patch the VMID: */
+       reg_sq_cmd.bits.vm_id = vmid;
+
+       dev->kfd2kgd->wave_control_execute(dev->kgd,
+                                       reg_gfx_index.u32All,
+                                       reg_sq_cmd.u32All);
+
+       return 0;
+}
+
 void kfd_dbgdev_init(struct kfd_dbgdev *pdbgdev, struct kfd_dev *pdev,
                        enum DBGDEV_TYPE type)
 {
index b08ec05658fe6bb8b74d13d88f8608b80ef6c8f0..547b0a589693615b6da7ae7d5ea87435ee71313f 100644 (file)
@@ -946,6 +946,7 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm,
 {
        int retval;
        enum kfd_preempt_type_filter preempt_type;
+       struct kfd_process *p;
 
        BUG_ON(!dqm);
 
@@ -977,8 +978,13 @@ static int destroy_queues_cpsch(struct device_queue_manager *dqm,
        pm_send_query_status(&dqm->packets, dqm->fence_gpu_addr,
                                KFD_FENCE_COMPLETED);
        /* should be timed out */
-       amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
+       retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED,
                                QUEUE_PREEMPT_DEFAULT_TIMEOUT_MS);
+       if (retval != 0) {
+               p = kfd_get_process(current);
+               p->reset_wavefronts = true;
+               goto out;
+       }
        pm_release_ib(&dqm->packets);
        dqm->active_runlist = false;
 
index cd1f033c7488377a01c5ae56cd28c16537e033ea..cb79046e5c8007050ca1eab89f0754cb7cce7acf 100644 (file)
@@ -519,6 +519,11 @@ struct kfd_process {
                                                                event_pages */
        u32 next_nonsignal_event_id;
        size_t signal_event_count;
+       /*
+        * This flag tells if we should reset all wavefronts on
+        * process termination
+        */
+       bool reset_wavefronts;
 };
 
 /**
@@ -726,4 +731,6 @@ int kfd_event_create(struct file *devkfd, struct kfd_process *p,
                     uint64_t *event_page_offset, uint32_t *event_slot_index);
 int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
 
+int dbgdev_wave_reset_wavefronts(struct kfd_dev *dev, struct kfd_process *p);
+
 #endif
index dc910af2bb3c45aa6e344a86e16d9ba4ae389713..56b904f5bdb19352cb0c7d15fa26669e3e4a97aa 100644 (file)
@@ -31,6 +31,7 @@
 struct mm_struct;
 
 #include "kfd_priv.h"
+#include "kfd_dbgmgr.h"
 
 /*
  * Initial size for the array of queues.
@@ -172,6 +173,9 @@ static void kfd_process_wq_release(struct work_struct *work)
                pr_debug("Releasing pdd (topology id %d) for process (pasid %d) in workqueue\n",
                                pdd->dev->id, p->pasid);
 
+               if (p->reset_wavefronts)
+                       dbgdev_wave_reset_wavefronts(pdd->dev, p);
+
                amd_iommu_unbind_pasid(pdd->dev->pdev, p->pasid);
                list_del(&pdd->per_device_list);
 
@@ -301,6 +305,8 @@ static struct kfd_process *create_process(const struct task_struct *thread)
        if (kfd_init_apertures(process) != 0)
                goto err_init_apretures;
 
+       process->reset_wavefronts = false;
+
        return process;
 
 err_init_apretures:
@@ -399,7 +405,12 @@ void kfd_unbind_process_from_device(struct kfd_dev *dev, unsigned int pasid)
 
        mutex_lock(&p->mutex);
 
+       if ((dev->dbgmgr) && (dev->dbgmgr->pasid == p->pasid))
+               kfd_dbgmgr_destroy(dev->dbgmgr);
+
        pqm_uninit(&p->pqm);
+       if (p->reset_wavefronts)
+               dbgdev_wave_reset_wavefronts(dev, p);
 
        pdd = kfd_get_process_device_data(dev, p);