drm/amdkfd: Add memory exception handling
authorAlexey Skidanov <alexey.skidanov@gmail.com>
Tue, 14 Apr 2015 15:05:49 +0000 (18:05 +0300)
committerOded Gabbay <oded.gabbay@gmail.com>
Tue, 19 May 2015 10:02:27 +0000 (13:02 +0300)
This patch adds Peripheral Page Request (PPR) failure processing
and reporting.

Bad address or pointer to a system memory block with inappropriate
read/write permission cause such PPR failure during a user queue
processing. PPR request handling is done by IOMMU driver notifying
AMDKFD module on PPR failure.

The process triggering a PPR failure will be notified by
appropriate event or SIGTERM signal will be sent to it.

v3:
- Change all bool fields in struct kfd_memory_exception_failure to
  uint32_t

Signed-off-by: Alexey Skidanov <alexey.skidanov@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
drivers/gpu/drm/amd/amdkfd/kfd_device.c
drivers/gpu/drm/amd/amdkfd/kfd_events.c
drivers/gpu/drm/amd/amdkfd/kfd_events.h
drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 4c0316957a57bb761611db739707069b4a2f2398..52cab0f53ebc6e03c194208d65c10abd39754d2e 100644 (file)
@@ -182,6 +182,32 @@ static void iommu_pasid_shutdown_callback(struct pci_dev *pdev, int pasid)
                kfd_unbind_process_from_device(dev, pasid);
 }
 
+/*
+ * This function called by IOMMU driver on PPR failure
+ */
+static int iommu_invalid_ppr_cb(struct pci_dev *pdev, int pasid,
+               unsigned long address, u16 flags)
+{
+       struct kfd_dev *dev;
+
+       dev_warn(kfd_device,
+                       "Invalid PPR device %x:%x.%x pasid %d address 0x%lX flags 0x%X",
+                       PCI_BUS_NUM(pdev->devfn),
+                       PCI_SLOT(pdev->devfn),
+                       PCI_FUNC(pdev->devfn),
+                       pasid,
+                       address,
+                       flags);
+
+       dev = kfd_device_by_pci_dev(pdev);
+       BUG_ON(dev == NULL);
+
+       kfd_signal_iommu_event(dev, pasid, address,
+                       flags & PPR_FAULT_WRITE, flags & PPR_FAULT_EXEC);
+
+       return AMD_IOMMU_INV_PRI_RSP_INVALID;
+}
+
 bool kgd2kfd_device_init(struct kfd_dev *kfd,
                         const struct kgd2kfd_shared_resources *gpu_resources)
 {
@@ -251,6 +277,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
        }
        amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
                                                iommu_pasid_shutdown_callback);
+       amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
 
        kfd->dqm = device_queue_manager_init(kfd);
        if (!kfd->dqm) {
@@ -316,6 +343,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd)
        if (kfd->init_complete) {
                kfd->dqm->ops.stop(kfd->dqm);
                amd_iommu_set_invalidate_ctx_cb(kfd->pdev, NULL);
+               amd_iommu_set_invalid_ppr_cb(kfd->pdev, NULL);
                amd_iommu_free_device(kfd->pdev);
        }
 }
@@ -335,6 +363,7 @@ int kgd2kfd_resume(struct kfd_dev *kfd)
                        return -ENXIO;
                amd_iommu_set_invalidate_ctx_cb(kfd->pdev,
                                                iommu_pasid_shutdown_callback);
+               amd_iommu_set_invalid_ppr_cb(kfd->pdev, iommu_invalid_ppr_cb);
                kfd->dqm->ops.start(kfd->dqm);
        }
 
index 23ffa966f3c2e934cebeb45e587e5ecba1e3f427..fa13d3efc616901a8504f5cf157f10596183d540 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/memory.h>
 #include "kfd_priv.h"
 #include "kfd_events.h"
+#include <linux/device.h>
 
 /*
  * A task can only be on a single wait_queue at a time, but we need to support
@@ -45,6 +46,10 @@ struct kfd_event_waiter {
 
        /* Transitions to true when the event this belongs to is signaled. */
        bool activated;
+
+       /* Event */
+       struct kfd_event *event;
+       uint32_t input_index;
 };
 
 /*
@@ -609,14 +614,17 @@ static struct kfd_event_waiter *alloc_event_waiters(uint32_t num_events)
 }
 
 static int init_event_waiter(struct kfd_process *p,
-                               struct kfd_event_waiter *waiter,
-                               uint32_t event_id)
+               struct kfd_event_waiter *waiter,
+               uint32_t event_id,
+               uint32_t input_index)
 {
        struct kfd_event *ev = lookup_event_by_id(p, event_id);
 
        if (!ev)
                return -EINVAL;
 
+       waiter->event = ev;
+       waiter->input_index = input_index;
        waiter->activated = ev->signaled;
        ev->signaled = ev->signaled && !ev->auto_reset;
 
@@ -643,6 +651,38 @@ static bool test_event_condition(bool all, uint32_t num_events,
        return activated_count == num_events;
 }
 
+/*
+ * Copy event specific data, if defined.
+ * Currently only memory exception events have additional data to copy to user
+ */
+static bool copy_signaled_event_data(uint32_t num_events,
+               struct kfd_event_waiter *event_waiters,
+               struct kfd_event_data __user *data)
+{
+       struct kfd_hsa_memory_exception_data *src;
+       struct kfd_hsa_memory_exception_data __user *dst;
+       struct kfd_event_waiter *waiter;
+       struct kfd_event *event;
+       uint32_t i;
+
+       for (i = 0; i < num_events; i++) {
+               waiter = &event_waiters[i];
+               event = waiter->event;
+               if (waiter->activated && event->type == KFD_EVENT_TYPE_MEMORY) {
+                       dst = &data[waiter->input_index].memory_exception_data;
+                       src = &event->memory_exception_data;
+                       if (copy_to_user(dst, src,
+                               sizeof(struct kfd_hsa_memory_exception_data)))
+                               return false;
+               }
+       }
+
+       return true;
+
+}
+
+
+
 static long user_timeout_to_jiffies(uint32_t user_timeout_ms)
 {
        if (user_timeout_ms == KFD_EVENT_TIMEOUT_IMMEDIATE)
@@ -672,10 +712,12 @@ static void free_waiters(uint32_t num_events, struct kfd_event_waiter *waiters)
 }
 
 int kfd_wait_on_events(struct kfd_process *p,
-                      uint32_t num_events, const uint32_t __user *event_ids,
+                      uint32_t num_events, void __user *data,
                       bool all, uint32_t user_timeout_ms,
                       enum kfd_event_wait_result *wait_result)
 {
+       struct kfd_event_data __user *events =
+                       (struct kfd_event_data __user *) data;
        uint32_t i;
        int ret = 0;
        struct kfd_event_waiter *event_waiters = NULL;
@@ -690,13 +732,14 @@ int kfd_wait_on_events(struct kfd_process *p,
        }
 
        for (i = 0; i < num_events; i++) {
-               uint32_t event_id;
+               struct kfd_event_data event_data;
 
-               ret = get_user(event_id, &event_ids[i]);
-               if (ret)
+               if (copy_from_user(&event_data, &events[i],
+                               sizeof(struct kfd_event_data)))
                        goto fail;
 
-               ret = init_event_waiter(p, &event_waiters[i], event_id);
+               ret = init_event_waiter(p, &event_waiters[i],
+                               event_data.event_id, i);
                if (ret)
                        goto fail;
        }
@@ -723,7 +766,11 @@ int kfd_wait_on_events(struct kfd_process *p,
                }
 
                if (test_event_condition(all, num_events, event_waiters)) {
-                       *wait_result = KFD_WAIT_COMPLETE;
+                       if (copy_signaled_event_data(num_events,
+                                       event_waiters, events))
+                               *wait_result = KFD_WAIT_COMPLETE;
+                       else
+                               *wait_result = KFD_WAIT_ERROR;
                        break;
                }
 
@@ -797,3 +844,95 @@ int kfd_event_mmap(struct kfd_process *p, struct vm_area_struct *vma)
        return remap_pfn_range(vma, vma->vm_start, pfn,
                        vma->vm_end - vma->vm_start, vma->vm_page_prot);
 }
+
+/*
+ * Assumes that p->event_mutex is held and of course
+ * that p is not going away (current or locked).
+ */
+static void lookup_events_by_type_and_signal(struct kfd_process *p,
+               int type, void *event_data)
+{
+       struct kfd_hsa_memory_exception_data *ev_data;
+       struct kfd_event *ev;
+       int bkt;
+       bool send_signal = true;
+
+       ev_data = (struct kfd_hsa_memory_exception_data *) event_data;
+
+       hash_for_each(p->events, bkt, ev, events)
+               if (ev->type == type) {
+                       send_signal = false;
+                       dev_dbg(kfd_device,
+                                       "Event found: id %X type %d",
+                                       ev->event_id, ev->type);
+                       set_event(ev);
+                       if (ev->type == KFD_EVENT_TYPE_MEMORY && ev_data)
+                               ev->memory_exception_data = *ev_data;
+               }
+
+       /* Send SIGTERM no event of type "type" has been found*/
+       if (send_signal) {
+               dev_warn(kfd_device,
+                       "Sending SIGTERM to HSA Process with PID %d ",
+                               p->lead_thread->pid);
+               send_sig(SIGTERM, p->lead_thread, 0);
+       }
+}
+
+void kfd_signal_iommu_event(struct kfd_dev *dev, unsigned int pasid,
+               unsigned long address, bool is_write_requested,
+               bool is_execute_requested)
+{
+       struct kfd_hsa_memory_exception_data memory_exception_data;
+       struct vm_area_struct *vma;
+
+       /*
+        * Because we are called from arbitrary context (workqueue) as opposed
+        * to process context, kfd_process could attempt to exit while we are
+        * running so the lookup function returns a locked process.
+        */
+       struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+       if (!p)
+               return; /* Presumably process exited. */
+
+       memset(&memory_exception_data, 0, sizeof(memory_exception_data));
+
+       down_read(&p->mm->mmap_sem);
+       vma = find_vma(p->mm, address);
+
+       memory_exception_data.gpu_id = dev->id;
+       memory_exception_data.va = address;
+       /* Set failure reason */
+       memory_exception_data.failure.NotPresent = 1;
+       memory_exception_data.failure.NoExecute = 0;
+       memory_exception_data.failure.ReadOnly = 0;
+       if (vma) {
+               if (vma->vm_start > address) {
+                       memory_exception_data.failure.NotPresent = 1;
+                       memory_exception_data.failure.NoExecute = 0;
+                       memory_exception_data.failure.ReadOnly = 0;
+               } else {
+                       memory_exception_data.failure.NotPresent = 0;
+                       if (is_write_requested && !(vma->vm_flags & VM_WRITE))
+                               memory_exception_data.failure.ReadOnly = 1;
+                       else
+                               memory_exception_data.failure.ReadOnly = 0;
+                       if (is_execute_requested && !(vma->vm_flags & VM_EXEC))
+                               memory_exception_data.failure.NoExecute = 1;
+                       else
+                               memory_exception_data.failure.NoExecute = 0;
+               }
+       }
+
+       up_read(&p->mm->mmap_sem);
+
+       mutex_lock(&p->event_mutex);
+
+       /* Lookup events by type and signal them */
+       lookup_events_by_type_and_signal(p, KFD_EVENT_TYPE_MEMORY,
+                       &memory_exception_data);
+
+       mutex_unlock(&p->event_mutex);
+       mutex_unlock(&p->mutex);
+}
index d9b5b381af0bf42fd48bd233b8d841798b19376b..691cf8573dc1f7a44459d5c11c88ee31c8d0fb26 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include "kfd_priv.h"
+#include <uapi/linux/kfd_ioctl.h>
 
 #define KFD_EVENT_ID_NONSIGNAL_MASK 0x80000000U
 #define KFD_FIRST_NONSIGNAL_EVENT_ID KFD_EVENT_ID_NONSIGNAL_MASK
@@ -61,6 +62,11 @@ struct kfd_event {
        struct signal_page *signal_page;
        unsigned int signal_slot_index;
        uint64_t __user *user_signal_address;
+
+       /* type specific data */
+       union {
+               struct kfd_hsa_memory_exception_data memory_exception_data;
+       };
 };
 
 #define KFD_EVENT_TIMEOUT_IMMEDIATE 0
@@ -69,6 +75,7 @@ struct kfd_event {
 /* Matching HSA_EVENTTYPE */
 #define KFD_EVENT_TYPE_SIGNAL 0
 #define KFD_EVENT_TYPE_DEBUG 5
+#define KFD_EVENT_TYPE_MEMORY 8
 
 extern void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
                                        uint32_t valid_id_bits);
index 0ff9a3daf3851159f9f029234f10ef4366f86780..35945032fff9e90fb1faab51a3bf6632d8644233 100644 (file)
@@ -693,11 +693,14 @@ void kfd_event_init_process(struct kfd_process *p);
 void kfd_event_free_process(struct kfd_process *p);
 int kfd_event_mmap(struct kfd_process *process, struct vm_area_struct *vma);
 int kfd_wait_on_events(struct kfd_process *p,
-                      uint32_t num_events, const uint32_t __user *event_ids,
+                      uint32_t num_events, void __user *data,
                       bool all, uint32_t user_timeout_ms,
                       enum kfd_event_wait_result *wait_result);
 void kfd_signal_event_interrupt(unsigned int pasid, uint32_t partial_id,
                                uint32_t valid_id_bits);
+void kfd_signal_iommu_event(struct kfd_dev *dev,
+               unsigned int pasid, unsigned long address,
+               bool is_write_requested, bool is_execute_requested);
 int kfd_set_event(struct kfd_process *p, uint32_t event_id);
 int kfd_reset_event(struct kfd_process *p, uint32_t event_id);
 int kfd_event_create(struct file *devkfd, struct kfd_process *p,