/*
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>

#include "vfio_pci_private.h"

#define DRIVER_VERSION	"0.2"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO PCI - User Level meta-driver"
static bool nointxmask;
module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nointxmask,
		 "Disable support for PCI 2.3 style INTx masking. If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
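/*
 * Enable the device on first open: reset it if possible, snapshot its
 * config space so vfio_pci_disable() can restore it, build the virtual
 * config space, un-mask INTx when PCI 2.3 masking is usable, and cache
 * the MSI-X table location so mmap() can refuse mappings over it.
 */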
static int vfio_pci_enable(struct vfio_pci_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	vdev->reset_works = (pci_reset_function(pdev) == 0);
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pr_debug("%s: Couldn't store %s saved state\n",
			 __func__, dev_name(&pdev->dev));

	ret = vfio_config_init(vdev);
	if (ret)
		goto out;

	if (likely(!nointxmask))
		vdev->pci_2_3 = pci_intx_mask_supported(pdev);

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
	if (msix_pos) {
		u16 flags;
		u32 table;

		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);

		vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
		vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
	} else
		vdev->msix_bar = 0xFF;

	ret = pci_enable_device(pdev);
	if (ret)
		goto out;

	return 0;

out:
	kfree(vdev->pci_saved_state);
	vdev->pci_saved_state = NULL;
	vfio_config_free(vdev);
	return ret;
}
static void vfio_pci_disable(struct vfio_pci_device *vdev)
{
	int bar;

	pci_disable_device(vdev->pdev);

	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
				VFIO_IRQ_SET_ACTION_TRIGGER,
				vdev->irq_type, 0, 0, NULL);

	vdev->virq_disabled = false;

	vfio_config_free(vdev);

	pci_reset_function(vdev->pdev);

	if (pci_load_and_free_saved_state(vdev->pdev,
					  &vdev->pci_saved_state) == 0)
		pci_restore_state(vdev->pdev);
	else
		pr_info("%s: Couldn't reload %s saved state\n",
			__func__, dev_name(&vdev->pdev->dev));

	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
		if (!vdev->barmap[bar])
			continue;
		pci_iounmap(vdev->pdev, vdev->barmap[bar]);
		pci_release_selected_regions(vdev->pdev, 1 << bar);
		vdev->barmap[bar] = NULL;
	}
}
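/*
 * Device fd release: the last reference disables the device and
 * module_put() balances the try_module_get() taken in open.
 */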
static void vfio_pci_release(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	if (atomic_dec_and_test(&vdev->refcnt))
		vfio_pci_disable(vdev);

	module_put(THIS_MODULE);
}
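/* The first reference to the device enables it. */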
static int vfio_pci_open(void *device_data)
{
	struct vfio_pci_device *vdev = device_data;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	if (atomic_inc_return(&vdev->refcnt) == 1) {
		int ret = vfio_pci_enable(vdev);
		if (ret) {
			module_put(THIS_MODULE);
			return ret;
		}
	}

	return 0;
}
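/*
 * Number of interrupts of a given type the device supports: 1 for INTx
 * when an interrupt pin is present, the vector count encoded in the
 * MSI/MSI-X capability otherwise, and 0 if the type is unsupported.
 */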
static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
{
	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
		u8 pin;
		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
		if (pin)
			return 1;

	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSI_FLAGS, &flags);

			return 1 << (flags & PCI_MSI_FLAGS_QMASK);
		}
	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
		u8 pos;
		u16 flags;

		pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
		if (pos) {
			pci_read_config_word(vdev->pdev,
					     pos + PCI_MSIX_FLAGS, &flags);

			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
		}
	}

	return 0;
}
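/*
 * All VFIO ioctls use an extensible-struct protocol: userspace sets
 * argsz, we copy in only the fields we know about (minsz) and reject
 * buffers that are too small, so old and new components interoperate.
 */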
static long vfio_pci_ioctl(void *device_data,
			   unsigned int cmd, unsigned long arg)
{
	struct vfio_pci_device *vdev = device_data;
	unsigned long minsz;

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;

		if (vdev->reset_works)
			info.flags |= VFIO_DEVICE_FLAGS_RESET;

		info.num_regions = VFIO_PCI_NUM_REGIONS;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz);

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct pci_dev *pdev = vdev->pdev;
		struct vfio_region_info info;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pdev->cfg_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			if (pci_resource_flags(pdev, info.index) &
			    IORESOURCE_MEM && info.size >= PAGE_SIZE)
				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
			break;
		case VFIO_PCI_ROM_REGION_INDEX:
		{
			void __iomem *io;
			size_t size;

			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = 0;

			/* Report the BAR size, not the ROM size */
			info.size = pci_resource_len(pdev, info.index);
			if (!info.size)
				break;

			/* Is it really there? */
			io = pci_map_rom(pdev, &size);
			if (!io || !size) {
				info.size = 0;
				break;
			}
			pci_unmap_rom(pdev, io);

			info.flags = VFIO_REGION_INFO_FLAG_READ;
			break;
		}
		default:
			return -EINVAL;
		}

		return copy_to_user((void __user *)arg, &info, minsz);

	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = vfio_pci_get_irq_count(vdev, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz);

	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
			return -EINVAL;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			size_t size;

			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
				size = sizeof(uint8_t);
			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
				size = sizeof(int32_t);
			else
				return -EINVAL;

			if (hdr.argsz - minsz < hdr.count * size ||
			    hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
				return -EINVAL;

			data = kmalloc(hdr.count * size, GFP_KERNEL);
			if (!data)
				return -ENOMEM;

			if (copy_from_user(data, (void __user *)(arg + minsz),
					   hdr.count * size)) {
				kfree(data);
				return -EFAULT;
			}
		}

		mutex_lock(&vdev->igate);

		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
					      hdr.start, hdr.count, data);

		mutex_unlock(&vdev->igate);
		kfree(data);

		return ret;

	} else if (cmd == VFIO_DEVICE_RESET)
		return vdev->reset_works ?
			pci_reset_function(vdev->pdev) : -EINVAL;

	return -ENOTTY;
}
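/*
 * The file offset encodes the target: high bits of *ppos select the
 * region index, low bits the offset within it.  Dispatch to the
 * config, I/O port, or memory access handler as appropriate.
 */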
static ssize_t vfio_pci_read(void *device_data, char __user *buf,
			     size_t count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;

	if (index >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
	else if (index == VFIO_PCI_ROM_REGION_INDEX)
		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
	else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
		return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
	else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);

	return -EINVAL;
}
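/*
 * Writes mirror vfio_pci_read() but pass write = true to the handlers;
 * the ROM region is read-only, so writes to it are rejected.
 */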
static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
			      size_t count, loff_t *ppos)
{
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;

	if (index >= VFIO_PCI_NUM_REGIONS)
		return -EINVAL;

	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
		return vfio_pci_config_readwrite(vdev, (char __user *)buf,
						 count, ppos, true);
	else if (index == VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
		return vfio_pci_io_readwrite(vdev, (char __user *)buf,
					     count, ppos, true);
	else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
		return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
					      count, ppos, true);

	return -EINVAL;
}
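/*
 * Map a memory BAR directly into the caller's address space; the mmap
 * offset selects the BAR index just as the read/write offset does.
 */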
static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
{
	struct vfio_pci_device *vdev = device_data;
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
		return -EINVAL;

	phys_len = pci_resource_len(pdev, index);
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
		return -EINVAL;

	if (index == vdev->msix_bar) {
		/*
		 * Disallow mmaps overlapping the MSI-X table; users don't
		 * get to touch this directly.  We could find somewhere
		 * else to map the overlap, but page granularity is only
		 * a recommendation, not a requirement, so the user needs
		 * to know which bits are real.  Requiring them to mmap
		 * around the table makes that clear.
		 */

		/* If neither entirely above nor below, then it overlaps */
		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
		      req_start + req_len <= vdev->msix_offset))
			return -EINVAL;
	}

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
	}

	vma->vm_private_data = vdev;
	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;

	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			       req_len, vma->vm_page_prot);
}
static const struct vfio_device_ops vfio_pci_ops = {
	.name		= "vfio-pci",
	.open		= vfio_pci_open,
	.release	= vfio_pci_release,
	.ioctl		= vfio_pci_ioctl,
	.read		= vfio_pci_read,
	.write		= vfio_pci_write,
	.mmap		= vfio_pci_mmap,
};
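/*
 * Bind only to normal (type 0) PCI functions that belong to an IOMMU
 * group; without a group the device cannot be safely handed to users.
 */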
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	u8 type;
	struct vfio_pci_device *vdev;
	struct iommu_group *group;
	int ret;

	pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
	if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
		return -EINVAL;

	group = iommu_group_get(&pdev->dev);
	if (!group)
		return -EINVAL;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev) {
		iommu_group_put(group);
		return -ENOMEM;
	}

	vdev->pdev = pdev;
	vdev->irq_type = VFIO_PCI_NUM_IRQS;
	mutex_init(&vdev->igate);
	spin_lock_init(&vdev->irqlock);
	atomic_set(&vdev->refcnt, 0);

	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
	if (ret) {
		iommu_group_put(group);
		kfree(vdev);
	}

	return ret;
}
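/* Unwind probe: detach from the VFIO core and drop the group reference. */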
static void vfio_pci_remove(struct pci_dev *pdev)
{
	struct vfio_pci_device *vdev;

	vdev = vfio_del_group_dev(&pdev->dev);
	if (!vdev)
		return;

	iommu_group_put(pdev->dev.iommu_group);
	kfree(vdev);
}
static struct pci_driver vfio_pci_driver = {
	.name		= "vfio-pci",
	.id_table	= NULL, /* only dynamic ids */
	.probe		= vfio_pci_probe,
	.remove		= vfio_pci_remove,
};
static void __exit vfio_pci_cleanup(void)
{
	pci_unregister_driver(&vfio_pci_driver);
	vfio_pci_virqfd_exit();
	vfio_pci_uninit_perm_bits();
}
static int __init vfio_pci_init(void)
{
	int ret;

	/* Allocate shared config space permission data used by all devices */
	ret = vfio_pci_init_perm_bits();
	if (ret)
		return ret;

	/* Start the virqfd cleanup handler */
	ret = vfio_pci_virqfd_init();
	if (ret)
		goto out_virqfd;

	/* Register and scan for devices */
	ret = pci_register_driver(&vfio_pci_driver);
	if (ret)
		goto out_driver;

	return 0;

out_driver:
	vfio_pci_virqfd_exit();
out_virqfd:
	vfio_pci_uninit_perm_bits();
	return ret;
}
module_init(vfio_pci_init);
module_exit(vfio_pci_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);