drm/i915/gvt/kvmgt: add vfio/mdev support to KVMGT
authorJike Song <jike.song@intel.com>
Thu, 8 Dec 2016 03:00:36 +0000 (11:00 +0800)
committerZhenyu Wang <zhenyuw@linux.intel.com>
Fri, 16 Dec 2016 08:55:26 +0000 (16:55 +0800)
KVMGT leverages vfio/mdev to mediate device accesses from the guest;
this patch adds the vfio/mdev support and thereby completes the
functionality. An intel_vgpu is presented as an mdev device, and full
userspace API compatibility with vfio-pci is kept. A parent_ops table
(intel_vgpu_ops) is provided to the mdev framework; its methods are
called to create/remove a vgpu, to open/close it, and to access it.
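
Because the vfio-pci ABI is kept, an unmodified VFIO userspace client
can drive a vGPU once it has been created as an mdev. A minimal sketch
(the group number and mdev UUID are placeholders, error handling and
the usual VFIO_GROUP_GET_STATUS check are omitted):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    int container = open("/dev/vfio/vfio", O_RDWR);
    int group = open("/dev/vfio/42", O_RDWR);      /* group of the mdev */

    ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
    ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

    /* the device name handed to VFIO is the mdev UUID */
    int device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD,
                       "00000000-0000-0000-0000-000000000001");
    struct vfio_device_info info = { .argsz = sizeof(info) };
    ioctl(device, VFIO_DEVICE_GET_INFO, &info);    /* FLAGS_PCI is set */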

Signed-off-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Xiaoguang Chen <xiaoguang.chen@intel.com>
Signed-off-by: Jike Song <jike.song@intel.com>
Signed-off-by: Zhenyu Wang <zhenyuw@linux.intel.com>
drivers/gpu/drm/i915/Kconfig
drivers/gpu/drm/i915/gvt/Makefile
drivers/gpu/drm/i915/gvt/gvt.h
drivers/gpu/drm/i915/gvt/kvmgt.c

index 5ddde7349fbde42a0cb1101f904715e4739ead88..183f5dc1c3f228fbc29df3522cfadff18641ba9b 100644 (file)
@@ -116,6 +116,7 @@ config DRM_I915_GVT_KVMGT
        tristate "Enable KVM/VFIO support for Intel GVT-g"
        depends on DRM_I915_GVT
        depends on KVM
+       depends on VFIO_MDEV && VFIO_MDEV_DEVICE
        default n
        help
          Choose this option if you want to enable KVMGT support for
index 8a46a7f31d536005052953a82a6fdac471042d0c..b123c20e209740d4f3c3af36dd3ca6835972995e 100644 (file)
@@ -5,6 +5,4 @@ GVT_SOURCE := gvt.o aperture_gm.o handlers.o vgpu.o trace_points.o firmware.o \
 
 ccflags-y                              += -I$(src) -I$(src)/$(GVT_DIR) -Wall
 i915-y                                 += $(addprefix $(GVT_DIR)/, $(GVT_SOURCE))
-
-CFLAGS_kvmgt.o                         := -Wno-unused-function
 obj-$(CONFIG_DRM_I915_GVT_KVMGT)       += $(GVT_DIR)/kvmgt.o
index b1a7c8dd4b5fdc38b7970e35d772feba19d49c26..ad0e9364ee703a65ce2a33834321bb4a9fce34f1 100644 (file)
@@ -164,15 +164,17 @@ struct intel_vgpu {
 
 #if IS_ENABLED(CONFIG_DRM_I915_GVT_KVMGT)
        struct {
-               struct device *mdev;
+               struct mdev_device *mdev;
                struct vfio_region *region;
                int num_regions;
                struct eventfd_ctx *intx_trigger;
                struct eventfd_ctx *msi_trigger;
                struct rb_root cache;
                struct mutex cache_lock;
-               void *vfio_group;
                struct notifier_block iommu_notifier;
+               struct notifier_block group_notifier;
+               struct kvm *kvm;
+               struct work_struct release_work;
        } vdev;
 #endif
 };
index 24496ad6a942fb91321fdb4ffa03c0c5b07d3956..4dd6722a733933224c269825d6f12f53525bf9c7 100644 (file)
 #include <linux/uuid.h>
 #include <linux/kvm_host.h>
 #include <linux/vfio.h>
+#include <linux/mdev.h>
 
 #include "i915_drv.h"
 #include "gvt.h"
 
-static inline long kvmgt_pin_pages(struct device *dev, unsigned long *user_pfn,
-                       long npage, int prot, unsigned long *phys_pfn)
-{
-       return 0;
-}
-static inline long kvmgt_unpin_pages(struct device *dev, unsigned long *pfn,
-                       long npage)
-{
-       return 0;
-}
-
 static const struct intel_gvt_ops *intel_gvt_ops;
 
-
 /* helper macros copied from vfio-pci */
 #define VFIO_PCI_OFFSET_SHIFT   40
 #define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
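
These helpers pack a region index into the top bits of the file offset,
the same scheme vfio-pci uses. A worked example (illustrative, not part
of the patch):

    /* BAR2 (index 2) at byte 0x1000 inside the region: */
    loff_t off = ((loff_t)VFIO_PCI_BAR2_REGION_INDEX << VFIO_PCI_OFFSET_SHIFT)
                 | 0x1000;                       /* (2ULL << 40) | 0x1000 */
    unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(off);   /* == 2 */
    loff_t pos = off & VFIO_PCI_OFFSET_MASK;               /* == 0x1000 */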
@@ -91,6 +80,15 @@ struct gvt_dma {
        kvm_pfn_t pfn;
 };
 
+static inline bool handle_valid(unsigned long handle)
+{
+       return !!(handle & ~0xff);
+}
+
+static int kvmgt_guest_init(struct mdev_device *mdev);
+static void intel_vgpu_release_work(struct work_struct *work);
+static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
+
 static struct gvt_dma *__gvt_cache_find(struct intel_vgpu *vgpu, gfn_t gfn)
 {
        struct rb_node *node = vgpu->vdev.cache.rb_node;
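
The handle_valid() helper added above relies on vgpu->handle holding
either a genuine struct kvmgt_guest_info pointer or a small reserved
value (it is reset to 0 in __intel_vgpu_release() below); any kernel
pointer sits far above 0xff, so masking the low byte tells the two
apart. For illustration:

    handle_valid(0);              /* false: no guest bound yet */
    handle_valid(vgpu->handle);   /* true once kvmgt_guest_init() ran */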
@@ -168,9 +166,10 @@ static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
 
 static void gvt_cache_remove(struct intel_vgpu *vgpu, gfn_t gfn)
 {
-       struct device *dev = vgpu->vdev.mdev;
+       struct device *dev = &vgpu->vdev.mdev->dev;
        struct gvt_dma *this;
-       unsigned long pfn;
+       unsigned long g1;
+       int rc;
 
        mutex_lock(&vgpu->vdev.cache_lock);
        this  = __gvt_cache_find(vgpu, gfn);
@@ -179,8 +178,9 @@ static void gvt_cache_remove(struct intel_vgpu *vgpu, gfn_t gfn)
                return;
        }
 
-       pfn = this->pfn;
-       WARN_ON((kvmgt_unpin_pages(dev, &pfn, 1) != 1));
+       g1 = gfn;
+       rc = vfio_unpin_pages(dev, &g1, 1);
+       WARN_ON(rc != 1);
        __gvt_cache_remove_entry(vgpu, this);
        mutex_unlock(&vgpu->vdev.cache_lock);
 }
@@ -195,15 +195,15 @@ static void gvt_cache_destroy(struct intel_vgpu *vgpu)
 {
        struct gvt_dma *dma;
        struct rb_node *node = NULL;
-       struct device *dev = vgpu->vdev.mdev;
-       unsigned long pfn;
+       struct device *dev = &vgpu->vdev.mdev->dev;
+       unsigned long gfn;
 
        mutex_lock(&vgpu->vdev.cache_lock);
        while ((node = rb_first(&vgpu->vdev.cache))) {
                dma = rb_entry(node, struct gvt_dma, node);
-               pfn = dma->pfn;
+               gfn = dma->gfn;
 
-               kvmgt_unpin_pages(dev, &pfn, 1);
+               vfio_unpin_pages(dev, &gfn, 1);
                __gvt_cache_remove_entry(vgpu, dma);
        }
        mutex_unlock(&vgpu->vdev.cache_lock);
@@ -227,7 +227,53 @@ static struct intel_vgpu_type *intel_gvt_find_vgpu_type(struct intel_gvt *gvt,
        return NULL;
 }
 
+static ssize_t available_instance_show(struct kobject *kobj, struct device *dev,
+               char *buf)
+{
+       struct intel_vgpu_type *type;
+       unsigned int num = 0;
+       void *gvt = kdev_to_i915(dev)->gvt;
+
+       type = intel_gvt_find_vgpu_type(gvt, kobject_name(kobj));
+       if (!type)
+               num = 0;
+       else
+               num = type->avail_instance;
+
+       return sprintf(buf, "%u\n", num);
+}
+
+static ssize_t device_api_show(struct kobject *kobj, struct device *dev,
+               char *buf)
+{
+       return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING);
+}
+
+static ssize_t description_show(struct kobject *kobj, struct device *dev,
+               char *buf)
+{
+       struct intel_vgpu_type *type;
+       void *gvt = kdev_to_i915(dev)->gvt;
+
+       type = intel_gvt_find_vgpu_type(gvt, kobject_name(kobj));
+       if (!type)
+               return 0;
+
+       return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n"
+                               "fence: %d\n",
+                               BYTES_TO_MB(type->low_gm_size),
+                               BYTES_TO_MB(type->high_gm_size),
+                               type->fence);
+}
+
+static MDEV_TYPE_ATTR_RO(available_instance);
+static MDEV_TYPE_ATTR_RO(device_api);
+static MDEV_TYPE_ATTR_RO(description);
+
 static struct attribute *type_attrs[] = {
+       &mdev_type_attr_available_instance.attr,
+       &mdev_type_attr_device_api.attr,
+       &mdev_type_attr_description.attr,
        NULL,
 };
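
These attributes surface under the mdev type's sysfs directory,
mdev_supported_types/<driver>-<type name>/ on the parent device. A
hedged userspace sketch reading available_instance (the parent PCI
address and type name are illustrative):

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/bus/pci/devices/0000:00:02.0/"
                            "mdev_supported_types/i915-GVTg_V4_2/"
                            "available_instance", "r");
            unsigned int n;

            if (f && fscanf(f, "%u", &n) == 1)
                    printf("instances left: %u\n", n);
            if (f)
                    fclose(f);
            return 0;
    }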
 
@@ -343,6 +389,720 @@ static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
        }
 }
 
+static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
+{
+       struct intel_vgpu *vgpu;
+       struct intel_vgpu_type *type;
+       struct device *pdev;
+       void *gvt;
+
+       pdev = mdev->parent->dev;
+       gvt = kdev_to_i915(pdev)->gvt;
+
+       type = intel_gvt_find_vgpu_type(gvt, kobject_name(kobj));
+       if (!type) {
+               gvt_err("failed to find type %s to create\n",
+                                               kobject_name(kobj));
+               return -EINVAL;
+       }
+
+       vgpu = intel_gvt_ops->vgpu_create(gvt, type);
+       if (IS_ERR_OR_NULL(vgpu)) {
+               gvt_err("create intel vgpu failed\n");
+               return -EINVAL;
+       }
+
+       INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);
+
+       vgpu->vdev.mdev = mdev;
+       mdev_set_drvdata(mdev, vgpu);
+
+       gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
+                    dev_name(&mdev->dev));
+       return 0;
+}
+
+static int intel_vgpu_remove(struct mdev_device *mdev)
+{
+       struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
+
+       if (handle_valid(vgpu->handle))
+               return -EBUSY;
+
+       intel_gvt_ops->vgpu_destroy(vgpu);
+       return 0;
+}
+
+static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
+                                    unsigned long action, void *data)
+{
+       struct intel_vgpu *vgpu = container_of(nb,
+                                       struct intel_vgpu,
+                                       vdev.iommu_notifier);
+
+       if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
+               struct vfio_iommu_type1_dma_unmap *unmap = data;
+               unsigned long gfn, end_gfn;
+
+               gfn = unmap->iova >> PAGE_SHIFT;
+               end_gfn = gfn + unmap->size / PAGE_SIZE;
+
+               while (gfn < end_gfn)
+                       gvt_cache_remove(vgpu, gfn++);
+       }
+
+       return NOTIFY_OK;
+}
+
+static int intel_vgpu_group_notifier(struct notifier_block *nb,
+                                    unsigned long action, void *data)
+{
+       struct intel_vgpu *vgpu = container_of(nb,
+                                       struct intel_vgpu,
+                                       vdev.group_notifier);
+
+       /* the only action we care about */
+       if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
+               vgpu->vdev.kvm = data;
+
+               if (!data)
+                       schedule_work(&vgpu->vdev.release_work);
+       }
+
+       return NOTIFY_OK;
+}
+
+static int intel_vgpu_open(struct mdev_device *mdev)
+{
+       struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
+       unsigned long events;
+       int ret;
+
+       vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
+       vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;
+
+       events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
+       ret = vfio_register_notifier(&mdev->dev, VFIO_IOMMU_NOTIFY, &events,
+                               &vgpu->vdev.iommu_notifier);
+       if (ret != 0) {
+               gvt_err("vfio_register_notifier for iommu failed: %d\n", ret);
+               goto out;
+       }
+
+       events = VFIO_GROUP_NOTIFY_SET_KVM;
+       ret = vfio_register_notifier(&mdev->dev, VFIO_GROUP_NOTIFY, &events,
+                               &vgpu->vdev.group_notifier);
+       if (ret != 0) {
+               gvt_err("vfio_register_notifier for group failed: %d\n", ret);
+               goto undo_iommu;
+       }
+
+       return kvmgt_guest_init(mdev);
+
+undo_iommu:
+       vfio_unregister_notifier(&mdev->dev, VFIO_IOMMU_NOTIFY,
+                                       &vgpu->vdev.iommu_notifier);
+out:
+       return ret;
+}
+
+static void __intel_vgpu_release(struct intel_vgpu *vgpu)
+{
+       struct kvmgt_guest_info *info;
+
+       if (!handle_valid(vgpu->handle))
+               return;
+
+       vfio_unregister_notifier(&vgpu->vdev.mdev->dev, VFIO_IOMMU_NOTIFY,
+                                       &vgpu->vdev.iommu_notifier);
+       vfio_unregister_notifier(&vgpu->vdev.mdev->dev, VFIO_GROUP_NOTIFY,
+                                       &vgpu->vdev.group_notifier);
+
+       info = (struct kvmgt_guest_info *)vgpu->handle;
+       kvmgt_guest_exit(info);
+       vgpu->handle = 0;
+}
+
+static void intel_vgpu_release(struct mdev_device *mdev)
+{
+       struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
+
+       __intel_vgpu_release(vgpu);
+}
+
+static void intel_vgpu_release_work(struct work_struct *work)
+{
+       struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
+                                       vdev.release_work);
+       __intel_vgpu_release(vgpu);
+}
+
+static uint64_t intel_vgpu_get_bar0_addr(struct intel_vgpu *vgpu)
+{
+       u32 start_lo, start_hi;
+       u32 mem_type;
+       int pos = PCI_BASE_ADDRESS_0;
+
+       start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + pos)) &
+                       PCI_BASE_ADDRESS_MEM_MASK;
+       mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + pos)) &
+                       PCI_BASE_ADDRESS_MEM_TYPE_MASK;
+
+       switch (mem_type) {
+       case PCI_BASE_ADDRESS_MEM_TYPE_64:
+               start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
+                                               + pos + 4));
+               break;
+       case PCI_BASE_ADDRESS_MEM_TYPE_32:
+       case PCI_BASE_ADDRESS_MEM_TYPE_1M:
+               /* 1M mem BAR treated as 32-bit BAR */
+       default:
+               /* unknown mem type treated as 32-bit BAR */
+               start_hi = 0;
+               break;
+       }
+
+       return ((u64)start_hi << 32) | start_lo;
+}
+
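
A worked example of the decode above (values illustrative): if the
config dwords at 0x10/0x14 read 0xf0000004 and 0x00000001, mem_type is
PCI_BASE_ADDRESS_MEM_TYPE_64 and the high dword is consumed:

    u32 lo = 0xf0000004, hi = 0x00000001;
    u64 base = ((u64)hi << 32) | (lo & PCI_BASE_ADDRESS_MEM_MASK);
    /* base == 0x1f0000000 */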
+static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
+                       size_t count, loff_t *ppos, bool is_write)
+{
+       struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
+       unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+       uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+       int ret = -EINVAL;
+
+       if (index >= VFIO_PCI_NUM_REGIONS) {
+               gvt_err("invalid index: %u\n", index);
+               return -EINVAL;
+       }
+
+       switch (index) {
+       case VFIO_PCI_CONFIG_REGION_INDEX:
+               if (is_write)
+                       ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
+                                               buf, count);
+               else
+                       ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
+                                               buf, count);
+               break;
+       case VFIO_PCI_BAR0_REGION_INDEX:
+       case VFIO_PCI_BAR1_REGION_INDEX:
+               if (is_write) {
+                       uint64_t bar0_start = intel_vgpu_get_bar0_addr(vgpu);
+
+                       ret = intel_gvt_ops->emulate_mmio_write(vgpu,
+                                               bar0_start + pos, buf, count);
+               } else {
+                       uint64_t bar0_start = intel_vgpu_get_bar0_addr(vgpu);
+
+                       ret = intel_gvt_ops->emulate_mmio_read(vgpu,
+                                               bar0_start + pos, buf, count);
+               }
+               break;
+       case VFIO_PCI_BAR2_REGION_INDEX:
+       case VFIO_PCI_BAR3_REGION_INDEX:
+       case VFIO_PCI_BAR4_REGION_INDEX:
+       case VFIO_PCI_BAR5_REGION_INDEX:
+       case VFIO_PCI_VGA_REGION_INDEX:
+       case VFIO_PCI_ROM_REGION_INDEX:
+       default:
+               gvt_err("unsupported region: %u\n", index);
+       }
+
+       return ret == 0 ? count : ret;
+}
+
+static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
+                       size_t count, loff_t *ppos)
+{
+       unsigned int done = 0;
+       int ret;
+
+       while (count) {
+               size_t filled;
+
+               if (count >= 4 && !(*ppos % 4)) {
+                       u32 val;
+
+                       ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
+                                       ppos, false);
+                       if (ret <= 0)
+                               goto read_err;
+
+                       if (copy_to_user(buf, &val, sizeof(val)))
+                               goto read_err;
+
+                       filled = 4;
+               } else if (count >= 2 && !(*ppos % 2)) {
+                       u16 val;
+
+                       ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
+                                       ppos, false);
+                       if (ret <= 0)
+                               goto read_err;
+
+                       if (copy_to_user(buf, &val, sizeof(val)))
+                               goto read_err;
+
+                       filled = 2;
+               } else {
+                       u8 val;
+
+                       ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
+                                       false);
+                       if (ret <= 0)
+                               goto read_err;
+
+                       if (copy_to_user(buf, &val, sizeof(val)))
+                               goto read_err;
+
+                       filled = 1;
+               }
+
+               count -= filled;
+               done += filled;
+               *ppos += filled;
+               buf += filled;
+       }
+
+       return done;
+
+read_err:
+       return -EFAULT;
+}
+
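
Both intel_vgpu_read() above and intel_vgpu_write() below split an
arbitrary access into the largest naturally aligned pieces, each
forwarded to intel_vgpu_rw(). For example (illustrative), a 7-byte read
at *ppos == 2 becomes a 2-byte access at 2, a 4-byte access at 4 and a
1-byte access at 8.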
+static ssize_t intel_vgpu_write(struct mdev_device *mdev,
+                               const char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       unsigned int done = 0;
+       int ret;
+
+       while (count) {
+               size_t filled;
+
+               if (count >= 4 && !(*ppos % 4)) {
+                       u32 val;
+
+                       if (copy_from_user(&val, buf, sizeof(val)))
+                               goto write_err;
+
+                       ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
+                                       ppos, true);
+                       if (ret <= 0)
+                               goto write_err;
+
+                       filled = 4;
+               } else if (count >= 2 && !(*ppos % 2)) {
+                       u16 val;
+
+                       if (copy_from_user(&val, buf, sizeof(val)))
+                               goto write_err;
+
+                       ret = intel_vgpu_rw(mdev, (char *)&val,
+                                       sizeof(val), ppos, true);
+                       if (ret <= 0)
+                               goto write_err;
+
+                       filled = 2;
+               } else {
+                       u8 val;
+
+                       if (copy_from_user(&val, buf, sizeof(val)))
+                               goto write_err;
+
+                       ret = intel_vgpu_rw(mdev, &val, sizeof(val),
+                                       ppos, true);
+                       if (ret <= 0)
+                               goto write_err;
+
+                       filled = 1;
+               }
+
+               count -= filled;
+               done += filled;
+               *ppos += filled;
+               buf += filled;
+       }
+
+       return done;
+write_err:
+       return -EFAULT;
+}
+
+static int intel_vgpu_mmap(struct mdev_device *mdev, struct vm_area_struct *vma)
+{
+       unsigned int index;
+       u64 virtaddr;
+       unsigned long req_size, pgoff = 0;
+       pgprot_t pg_prot;
+       struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
+
+       index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+       if (index >= VFIO_PCI_ROM_REGION_INDEX)
+               return -EINVAL;
+
+       if (vma->vm_end < vma->vm_start)
+               return -EINVAL;
+       if ((vma->vm_flags & VM_SHARED) == 0)
+               return -EINVAL;
+       if (index != VFIO_PCI_BAR2_REGION_INDEX)
+               return -EINVAL;
+
+       pg_prot = vma->vm_page_prot;
+       virtaddr = vma->vm_start;
+       req_size = vma->vm_end - vma->vm_start;
+       pgoff = vgpu_aperture_pa_base(vgpu) >> PAGE_SHIFT;
+
+       return remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
+}
+
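
Userspace reaches this mmap path through the device fd, with the region
index carried in the upper bits of the file offset. A sketch (device_fd
and region_size are placeholders; a real client takes the offset from
VFIO_DEVICE_GET_REGION_INFO rather than hard-coding the 40-bit shift):

    #include <sys/mman.h>
    #include <linux/vfio.h>

    off_t bar2_off = (off_t)VFIO_PCI_BAR2_REGION_INDEX << 40;
    void *gm = mmap(NULL, region_size, PROT_READ | PROT_WRITE,
                    MAP_SHARED, device_fd, bar2_off);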
+static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
+{
+       if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
+               return 1;
+
+       return 0;
+}
+
+static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
+                       unsigned int index, unsigned int start,
+                       unsigned int count, uint32_t flags,
+                       void *data)
+{
+       return 0;
+}
+
+static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
+                       unsigned int index, unsigned int start,
+                       unsigned int count, uint32_t flags, void *data)
+{
+       return 0;
+}
+
+static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
+               unsigned int index, unsigned int start, unsigned int count,
+               uint32_t flags, void *data)
+{
+       return 0;
+}
+
+static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
+               unsigned int index, unsigned int start, unsigned int count,
+               uint32_t flags, void *data)
+{
+       struct eventfd_ctx *trigger;
+
+       if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+               int fd = *(int *)data;
+
+               trigger = eventfd_ctx_fdget(fd);
+               if (IS_ERR(trigger)) {
+                       gvt_err("eventfd_ctx_fdget failed\n");
+                       return PTR_ERR(trigger);
+               }
+               vgpu->vdev.msi_trigger = trigger;
+       }
+
+       return 0;
+}
+
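
Userspace installs the eventfd with VFIO_DEVICE_SET_IRQS, and
kvmgt_inject_msi() below signals it to deliver a guest MSI. A hedged
sketch of the userspace side (device_fd is a placeholder):

    #include <string.h>
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    int efd = eventfd(0, 0);
    char buf[sizeof(struct vfio_irq_set) + sizeof(int)];
    struct vfio_irq_set *set = (struct vfio_irq_set *)buf;

    set->argsz = sizeof(buf);
    set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
    set->index = VFIO_PCI_MSI_IRQ_INDEX;
    set->start = 0;
    set->count = 1;
    memcpy(set->data, &efd, sizeof(int));
    ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);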
+static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, uint32_t flags,
+               unsigned int index, unsigned int start, unsigned int count,
+               void *data)
+{
+       int (*func)(struct intel_vgpu *vgpu, unsigned int index,
+                       unsigned int start, unsigned int count, uint32_t flags,
+                       void *data) = NULL;
+
+       switch (index) {
+       case VFIO_PCI_INTX_IRQ_INDEX:
+               switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+               case VFIO_IRQ_SET_ACTION_MASK:
+                       func = intel_vgpu_set_intx_mask;
+                       break;
+               case VFIO_IRQ_SET_ACTION_UNMASK:
+                       func = intel_vgpu_set_intx_unmask;
+                       break;
+               case VFIO_IRQ_SET_ACTION_TRIGGER:
+                       func = intel_vgpu_set_intx_trigger;
+                       break;
+               }
+               break;
+       case VFIO_PCI_MSI_IRQ_INDEX:
+               switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+               case VFIO_IRQ_SET_ACTION_MASK:
+               case VFIO_IRQ_SET_ACTION_UNMASK:
+                       /* XXX Need masking support exported */
+                       break;
+               case VFIO_IRQ_SET_ACTION_TRIGGER:
+                       func = intel_vgpu_set_msi_trigger;
+                       break;
+               }
+               break;
+       }
+
+       if (!func)
+               return -ENOTTY;
+
+       return func(vgpu, index, start, count, flags, data);
+}
+
+static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
+                            unsigned long arg)
+{
+       struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
+       unsigned long minsz;
+
+       gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);
+
+       if (cmd == VFIO_DEVICE_GET_INFO) {
+               struct vfio_device_info info;
+
+               minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+               if (copy_from_user(&info, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (info.argsz < minsz)
+                       return -EINVAL;
+
+               info.flags = VFIO_DEVICE_FLAGS_PCI;
+               info.flags |= VFIO_DEVICE_FLAGS_RESET;
+               info.num_regions = VFIO_PCI_NUM_REGIONS;
+               info.num_irqs = VFIO_PCI_NUM_IRQS;
+
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
+
+       } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
+               struct vfio_region_info info;
+               struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+               int i, ret;
+               struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
+               size_t size;
+               int nr_areas = 1;
+               int cap_type_id;
+
+               minsz = offsetofend(struct vfio_region_info, offset);
+
+               if (copy_from_user(&info, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (info.argsz < minsz)
+                       return -EINVAL;
+
+               switch (info.index) {
+               case VFIO_PCI_CONFIG_REGION_INDEX:
+                       info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+                       info.size = INTEL_GVT_MAX_CFG_SPACE_SZ;
+                       info.flags = VFIO_REGION_INFO_FLAG_READ |
+                                    VFIO_REGION_INFO_FLAG_WRITE;
+                       break;
+               case VFIO_PCI_BAR0_REGION_INDEX:
+                       info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+                       info.size = vgpu->cfg_space.bar[info.index].size;
+                       if (!info.size) {
+                               info.flags = 0;
+                               break;
+                       }
+
+                       info.flags = VFIO_REGION_INFO_FLAG_READ |
+                                    VFIO_REGION_INFO_FLAG_WRITE;
+                       break;
+               case VFIO_PCI_BAR1_REGION_INDEX:
+                       info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+                       info.size = 0;
+                       info.flags = 0;
+                       break;
+               case VFIO_PCI_BAR2_REGION_INDEX:
+                       info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+                       info.flags = VFIO_REGION_INFO_FLAG_CAPS |
+                                       VFIO_REGION_INFO_FLAG_MMAP |
+                                       VFIO_REGION_INFO_FLAG_READ |
+                                       VFIO_REGION_INFO_FLAG_WRITE;
+                       info.size = gvt_aperture_sz(vgpu->gvt);
+
+                       size = sizeof(*sparse) +
+                                       (nr_areas * sizeof(*sparse->areas));
+                       sparse = kzalloc(size, GFP_KERNEL);
+                       if (!sparse)
+                               return -ENOMEM;
+
+                       sparse->nr_areas = nr_areas;
+                       cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
+                       sparse->areas[0].offset =
+                                       PAGE_ALIGN(vgpu_aperture_offset(vgpu));
+                       sparse->areas[0].size = vgpu_aperture_sz(vgpu);
+                       if (!caps.buf) {
+                               kfree(caps.buf);
+                               caps.buf = NULL;
+                               caps.size = 0;
+                       }
+                       break;
+
+               case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+                       info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+                       info.size = 0;
+
+                       info.flags = 0;
+                       gvt_dbg_core("get region info bar:%d\n", info.index);
+                       break;
+
+               case VFIO_PCI_ROM_REGION_INDEX:
+               case VFIO_PCI_VGA_REGION_INDEX:
+                       gvt_dbg_core("get region info index:%d\n", info.index);
+                       break;
+               default:
+                       {
+                               struct vfio_region_info_cap_type cap_type;
+
+                               if (info.index >= VFIO_PCI_NUM_REGIONS +
+                                               vgpu->vdev.num_regions)
+                                       return -EINVAL;
+
+                               i = info.index - VFIO_PCI_NUM_REGIONS;
+
+                               info.offset =
+                                       VFIO_PCI_INDEX_TO_OFFSET(info.index);
+                               info.size = vgpu->vdev.region[i].size;
+                               info.flags = vgpu->vdev.region[i].flags;
+
+                               cap_type.type = vgpu->vdev.region[i].type;
+                               cap_type.subtype = vgpu->vdev.region[i].subtype;
+
+                               ret = vfio_info_add_capability(&caps,
+                                               VFIO_REGION_INFO_CAP_TYPE,
+                                               &cap_type);
+                               if (ret)
+                                       return ret;
+                       }
+               }
+
+               if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
+                       switch (cap_type_id) {
+                       case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
+                               ret = vfio_info_add_capability(&caps,
+                                       VFIO_REGION_INFO_CAP_SPARSE_MMAP,
+                                       sparse);
+                               kfree(sparse);
+                               if (ret)
+                                       return ret;
+                               break;
+                       default:
+                               return -EINVAL;
+                       }
+               }
+
+               if (caps.size) {
+                       if (info.argsz < sizeof(info) + caps.size) {
+                               info.argsz = sizeof(info) + caps.size;
+                               info.cap_offset = 0;
+                       } else {
+                               vfio_info_cap_shift(&caps, sizeof(info));
+                               if (copy_to_user((void __user *)arg +
+                                                 sizeof(info), caps.buf,
+                                                 caps.size)) {
+                                       kfree(caps.buf);
+                                       return -EFAULT;
+                               }
+                               info.cap_offset = sizeof(info);
+                       }
+
+                       kfree(caps.buf);
+               }
+
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
+       } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
+               struct vfio_irq_info info;
+
+               minsz = offsetofend(struct vfio_irq_info, count);
+
+               if (copy_from_user(&info, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
+                       return -EINVAL;
+
+               switch (info.index) {
+               case VFIO_PCI_INTX_IRQ_INDEX:
+               case VFIO_PCI_MSI_IRQ_INDEX:
+                       break;
+               default:
+                       return -EINVAL;
+               }
+
+               info.flags = VFIO_IRQ_INFO_EVENTFD;
+
+               info.count = intel_vgpu_get_irq_count(vgpu, info.index);
+
+               if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
+                       info.flags |= (VFIO_IRQ_INFO_MASKABLE |
+                                      VFIO_IRQ_INFO_AUTOMASKED);
+               else
+                       info.flags |= VFIO_IRQ_INFO_NORESIZE;
+
+               return copy_to_user((void __user *)arg, &info, minsz) ?
+                       -EFAULT : 0;
+       } else if (cmd == VFIO_DEVICE_SET_IRQS) {
+               struct vfio_irq_set hdr;
+               u8 *data = NULL;
+               int ret = 0;
+               size_t data_size = 0;
+
+               minsz = offsetofend(struct vfio_irq_set, count);
+
+               if (copy_from_user(&hdr, (void __user *)arg, minsz))
+                       return -EFAULT;
+
+               if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
+                       int max = intel_vgpu_get_irq_count(vgpu, hdr.index);
+
+                       ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
+                                               VFIO_PCI_NUM_IRQS, &data_size);
+                       if (ret) {
+                               gvt_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
+                               return -EINVAL;
+                       }
+                       if (data_size) {
+                               data = memdup_user((void __user *)(arg + minsz),
+                                                  data_size);
+                               if (IS_ERR(data))
+                                       return PTR_ERR(data);
+                       }
+               }
+
+               ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
+                                       hdr.start, hdr.count, data);
+               kfree(data);
+
+               return ret;
+       } else if (cmd == VFIO_DEVICE_RESET) {
+               intel_gvt_ops->vgpu_reset(vgpu);
+               return 0;
+       }
+
+       return 0;
+}
+
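
The ioctl above mirrors vfio-pci's query model: argsz-checked structs
copied in and out, with capability chains appended past the fixed
header. A sketch of querying BAR2 (device_fd is a placeholder; when
argsz comes back larger, a real client re-issues the ioctl with a
bigger buffer and parses the sparse-mmap capability at cap_offset):

    struct vfio_region_info info = {
            .argsz = sizeof(info),
            .index = VFIO_PCI_BAR2_REGION_INDEX,
    };

    ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);
    /* info.flags carries FLAG_CAPS | FLAG_MMAP; info.size is the
     * aperture size reported by gvt_aperture_sz() */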
+static const struct parent_ops intel_vgpu_ops = {
+       .supported_type_groups  = intel_vgpu_type_groups,
+       .create                 = intel_vgpu_create,
+       .remove                 = intel_vgpu_remove,
+
+       .open                   = intel_vgpu_open,
+       .release                = intel_vgpu_release,
+
+       .read                   = intel_vgpu_read,
+       .write                  = intel_vgpu_write,
+       .mmap                   = intel_vgpu_mmap,
+       .ioctl                  = intel_vgpu_ioctl,
+};
+
 static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
 {
        if (!intel_gvt_init_vgpu_type_groups(gvt))
@@ -350,22 +1110,28 @@ static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
 
        intel_gvt_ops = ops;
 
-       /* MDEV is not yet available */
-       return -ENODEV;
+       return mdev_register_device(dev, &intel_vgpu_ops);
 }
 
 static void kvmgt_host_exit(struct device *dev, void *gvt)
 {
        intel_gvt_cleanup_vgpu_type_groups(gvt);
+       mdev_unregister_device(dev);
 }
 
 static int kvmgt_write_protect_add(unsigned long handle, u64 gfn)
 {
-       struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle;
-       struct kvm *kvm = info->kvm;
+       struct kvmgt_guest_info *info;
+       struct kvm *kvm;
        struct kvm_memory_slot *slot;
        int idx;
 
+       if (!handle_valid(handle))
+               return -ESRCH;
+
+       info = (struct kvmgt_guest_info *)handle;
+       kvm = info->kvm;
+
        idx = srcu_read_lock(&kvm->srcu);
        slot = gfn_to_memslot(kvm, gfn);
 
@@ -385,11 +1151,17 @@ out:
 
 static int kvmgt_write_protect_remove(unsigned long handle, u64 gfn)
 {
-       struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle;
-       struct kvm *kvm = info->kvm;
+       struct kvmgt_guest_info *info;
+       struct kvm *kvm;
        struct kvm_memory_slot *slot;
        int idx;
 
+       if (!handle_valid(handle))
+               return 0;
+
+       info = (struct kvmgt_guest_info *)handle;
+       kvm = info->kvm;
+
        idx = srcu_read_lock(&kvm->srcu);
        slot = gfn_to_memslot(kvm, gfn);
 
@@ -477,6 +1249,85 @@ static int kvmgt_detect_host(void)
        return kvmgt_check_guest() ? -ENODEV : 0;
 }
 
+static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
+{
+       struct intel_vgpu *itr;
+       struct kvmgt_guest_info *info;
+       int id;
+       bool ret = false;
+
+       mutex_lock(&vgpu->gvt->lock);
+       for_each_active_vgpu(vgpu->gvt, itr, id) {
+               if (!handle_valid(itr->handle))
+                       continue;
+
+               info = (struct kvmgt_guest_info *)itr->handle;
+               if (kvm && kvm == info->kvm) {
+                       ret = true;
+                       goto out;
+               }
+       }
+out:
+       mutex_unlock(&vgpu->gvt->lock);
+       return ret;
+}
+
+static int kvmgt_guest_init(struct mdev_device *mdev)
+{
+       struct kvmgt_guest_info *info;
+       struct intel_vgpu *vgpu;
+       struct kvm *kvm;
+
+       vgpu = mdev_get_drvdata(mdev);
+       if (handle_valid(vgpu->handle))
+               return -EEXIST;
+
+       kvm = vgpu->vdev.kvm;
+       if (!kvm || kvm->mm != current->mm) {
+               gvt_err("KVM is required to use Intel vGPU\n");
+               return -ESRCH;
+       }
+
+       if (__kvmgt_vgpu_exist(vgpu, kvm))
+               return -EEXIST;
+
+       info = vzalloc(sizeof(struct kvmgt_guest_info));
+       if (!info)
+               return -ENOMEM;
+
+       vgpu->handle = (unsigned long)info;
+       info->vgpu = vgpu;
+       info->kvm = kvm;
+
+       kvmgt_protect_table_init(info);
+       gvt_cache_init(vgpu);
+
+       info->track_node.track_write = kvmgt_page_track_write;
+       info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
+       kvm_page_track_register_notifier(kvm, &info->track_node);
+
+       return 0;
+}
+
+static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
+{
+       struct intel_vgpu *vgpu;
+
+       if (!info) {
+               gvt_err("kvmgt_guest_info invalid\n");
+               return false;
+       }
+
+       vgpu = info->vgpu;
+
+       kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
+       kvmgt_protect_table_destroy(info);
+       gvt_cache_destroy(vgpu);
+       vfree(info);
+
+       return true;
+}
+
 static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
 {
        /* nothing to do here */
@@ -490,30 +1341,42 @@ static void kvmgt_detach_vgpu(unsigned long handle)
 
 static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
 {
-       struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle;
-       struct intel_vgpu *vgpu = info->vgpu;
+       struct kvmgt_guest_info *info;
+       struct intel_vgpu *vgpu;
 
-       if (vgpu->vdev.msi_trigger)
-               return eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1;
+       if (!handle_valid(handle))
+               return -ESRCH;
 
-       return false;
+       info = (struct kvmgt_guest_info *)handle;
+       vgpu = info->vgpu;
+
+       if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
+               return 0;
+
+       return -EFAULT;
 }
 
 static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
 {
        unsigned long pfn;
-       struct kvmgt_guest_info *info = (struct kvmgt_guest_info *)handle;
+       struct kvmgt_guest_info *info;
+       struct device *dev;
        int rc;
 
+       if (!handle_valid(handle))
+               return INTEL_GVT_INVALID_ADDR;
+
+       info = (struct kvmgt_guest_info *)handle;
        pfn = gvt_cache_find(info->vgpu, gfn);
        if (pfn != 0)
                return pfn;
 
-       rc = kvmgt_pin_pages(info->vgpu->vdev.mdev, &gfn, 1,
-                            IOMMU_READ | IOMMU_WRITE, &pfn);
+       pfn = INTEL_GVT_INVALID_ADDR;
+       dev = &info->vgpu->vdev.mdev->dev;
+       rc = vfio_pin_pages(dev, &gfn, 1, IOMMU_READ | IOMMU_WRITE, &pfn);
        if (rc != 1) {
-               gvt_err("vfio_pin_pages failed for gfn: 0x%lx\n", gfn);
-               return 0;
+               gvt_err("vfio_pin_pages failed for gfn 0x%lx: %d\n", gfn, rc);
+               return INTEL_GVT_INVALID_ADDR;
        }
 
        gvt_cache_add(info->vgpu, gfn, pfn);
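
The pin and the cache entry travel together: kvmgt_gfn_to_pfn() calls
vfio_pin_pages() only on a cache miss, and the matching
vfio_unpin_pages() happens when gvt_cache_remove() runs from the
VFIO_IOMMU_NOTIFY_DMA_UNMAP notifier, or from gvt_cache_destroy() at
release, so every cached gfn stays pinned exactly once.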
@@ -528,6 +1391,9 @@ static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
        int ret;
        bool kthread = current->mm == NULL;
 
+       if (!handle_valid(handle))
+               return -ESRCH;
+
        info = (struct kvmgt_guest_info *)handle;
        kvm = info->kvm;