IB/hfi1: Fix an Oops on pci device force remove
authorTadeusz Struk <tadeusz.struk@intel.com>
Tue, 25 Oct 2016 15:57:55 +0000 (08:57 -0700)
committerDoug Ledford <dledford@redhat.com>
Tue, 15 Nov 2016 21:16:41 +0000 (16:16 -0500)
This patch fixes an Oops on device unbind, when the device is used
by a PSM user process. PSM processes access device resources which
are freed on device removal. Similar protection exists in uverbs
in ib_core for Verbs clients, but PSM doesn't use ib_uverbs hence
a separate protection is required for PSM clients.

Cc: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
Reviewed-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Dean Luick <dean.luick@intel.com>
Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
drivers/infiniband/hw/hfi1/chip.c
drivers/infiniband/hw/hfi1/file_ops.c
drivers/infiniband/hw/hfi1/hfi.h
drivers/infiniband/hw/hfi1/init.c

index 9bf5f23544d4cf529e67adcd9838fcc1692260bd..799215255b4966842d464b9250816f2fd5e5c991 100644 (file)
@@ -14691,6 +14691,11 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev,
        if (ret)
                goto bail_free_cntrs;
 
+       init_completion(&dd->user_comp);
+
+       /* The user refcount starts with one to inidicate an active device */
+       atomic_set(&dd->user_refcount, 1);
+
        goto bail;
 
 bail_free_rcverr:
index 677efa0e8cd689f167ab72bc13eb2b60dd91b8ed..bd786b7bd30b1256f523a89357e52d12f2a4266c 100644 (file)
@@ -172,6 +172,9 @@ static int hfi1_file_open(struct inode *inode, struct file *fp)
                                               struct hfi1_devdata,
                                               user_cdev);
 
+       if (!atomic_inc_not_zero(&dd->user_refcount))
+               return -ENXIO;
+
        /* Just take a ref now. Not all opens result in a context assign */
        kobject_get(&dd->kobj);
 
@@ -183,11 +186,17 @@ static int hfi1_file_open(struct inode *inode, struct file *fp)
                fd->rec_cpu_num = -1; /* no cpu affinity by default */
                fd->mm = current->mm;
                atomic_inc(&fd->mm->mm_count);
-       }
+               fp->private_data = fd;
+       } else {
+               fp->private_data = NULL;
+
+               if (atomic_dec_and_test(&dd->user_refcount))
+                       complete(&dd->user_comp);
 
-       fp->private_data = fd;
+               return -ENOMEM;
+       }
 
-       return fd ? 0 : -ENOMEM;
+       return 0;
 }
 
 static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
@@ -798,6 +807,10 @@ static int hfi1_file_close(struct inode *inode, struct file *fp)
 done:
        mmdrop(fdata->mm);
        kobject_put(&dd->kobj);
+
+       if (atomic_dec_and_test(&dd->user_refcount))
+               complete(&dd->user_comp);
+
        kfree(fdata);
        return 0;
 }
index 3c06d204bafd3d9a760ed131a2479db90599f770..368e96c109a50112c84dfe9213bf5acfa96daf22 100644 (file)
@@ -1174,6 +1174,10 @@ struct hfi1_devdata {
        spinlock_t aspm_lock;
        /* Number of verbs contexts which have disabled ASPM */
        atomic_t aspm_disabled_cnt;
+       /* Keeps track of user space clients */
+       atomic_t user_refcount;
+       /* Used to wait for outstanding user space clients before dev removal */
+       struct completion user_comp;
 
        struct hfi1_affinity *affinity;
        struct rhashtable sdma_rht;
index 60db61536fedf396c7a76dcdf777f7000a189eec..e28a6b633ea9930dc5ae91937e6a882a890789ae 100644 (file)
@@ -1538,12 +1538,31 @@ bail:
        return ret;
 }
 
+static void wait_for_clients(struct hfi1_devdata *dd)
+{
+       /*
+        * Remove the device init value and complete the device if there is
+        * no clients or wait for active clients to finish.
+        */
+       if (atomic_dec_and_test(&dd->user_refcount))
+               complete(&dd->user_comp);
+
+       wait_for_completion(&dd->user_comp);
+}
+
 static void remove_one(struct pci_dev *pdev)
 {
        struct hfi1_devdata *dd = pci_get_drvdata(pdev);
 
        /* close debugfs files before ib unregister */
        hfi1_dbg_ibdev_exit(&dd->verbs_dev);
+
+       /* remove the /dev hfi1 interface */
+       hfi1_device_remove(dd);
+
+       /* wait for existing user space clients to finish */
+       wait_for_clients(dd);
+
        /* unregister from IB core */
        hfi1_unregister_ib_device(dd);
 
@@ -1558,8 +1577,6 @@ static void remove_one(struct pci_dev *pdev)
        /* wait until all of our (qsfp) queue_work() calls complete */
        flush_workqueue(ib_wq);
 
-       hfi1_device_remove(dd);
-
        postinit_cleanup(dd);
 }