From e34a0425b8ef524355811e7408dc1d53d08dc538 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:01 -0300 Subject: [PATCH 01/77] vfio/pci: Split linux/vfio_pci_core.h The header in include/linux should have only the exported interface for other vfio_pci modules to use. Internal definitions for vfio_pci.ko should be in a "priv" header along side the .c files. Move the internal declarations out of vfio_pci_core.h. They either move to vfio_pci_priv.h or to the C file that is the only user. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/1-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 2 +- drivers/vfio/pci/vfio_pci_config.c | 2 +- drivers/vfio/pci/vfio_pci_core.c | 19 +++- drivers/vfio/pci/vfio_pci_igd.c | 2 +- drivers/vfio/pci/vfio_pci_intrs.c | 16 +++- drivers/vfio/pci/vfio_pci_priv.h | 106 +++++++++++++++++++++++ drivers/vfio/pci/vfio_pci_rdwr.c | 2 +- drivers/vfio/pci/vfio_pci_zdev.c | 2 +- include/linux/vfio_pci_core.h | 134 +---------------------------- 9 files changed, 145 insertions(+), 140 deletions(-) create mode 100644 drivers/vfio/pci/vfio_pci_priv.h diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 4d1a97415a27..d9b5c03f8d5b 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -25,7 +25,7 @@ #include #include -#include +#include "vfio_pci_priv.h" #define DRIVER_AUTHOR "Alex Williamson " #define DRIVER_DESC "VFIO PCI - User Level meta-driver" diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 442d3ba4122b..5f43b28075ee 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -26,7 +26,7 @@ #include #include -#include +#include "vfio_pci_priv.h" /* Fake capability ID for standard config space */ #define PCI_CAP_ID_BASIC 0 diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index c8d3b0450fb3..04180a0836cc 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -28,7 +28,7 @@ #include #include -#include +#include "vfio_pci_priv.h" #define DRIVER_AUTHOR "Alex Williamson " #define DRIVER_DESC "core driver for VFIO based PCI devices" @@ -41,6 +41,23 @@ static bool disable_idle_d3; static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex); static LIST_HEAD(vfio_pci_sriov_pfs); +struct vfio_pci_dummy_resource { + struct resource resource; + int index; + struct list_head res_next; +}; + +struct vfio_pci_vf_token { + struct mutex lock; + uuid_t uuid; + int users; +}; + +struct vfio_pci_mmap_vma { + struct vm_area_struct *vma; + struct list_head vma_next; +}; + static inline bool vfio_vga_disabled(void) { #ifdef CONFIG_VFIO_PCI_VGA diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 352c725ccf18..8177e9a1da3b 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -15,7 +15,7 @@ #include #include -#include +#include "vfio_pci_priv.h" #define OPREGION_SIGNATURE "IntelGraphicsMem" #define OPREGION_SIZE (8 * 1024) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 6069a11fb51a..32d014421c1f 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -20,7 +20,21 @@ #include #include -#include +#include "vfio_pci_priv.h" + +#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) +#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) +#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) +#define irq_is(vdev, type) (vdev->irq_type == type) + +struct vfio_pci_irq_ctx { + struct eventfd_ctx *trigger; + struct virqfd *unmask; + struct virqfd *mask; + char *name; + bool masked; + struct irq_bypass_producer producer; +}; /* * INTx diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h new file mode 100644 index 000000000000..ac701f05bef0 --- /dev/null +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef VFIO_PCI_PRIV_H +#define VFIO_PCI_PRIV_H + +#include + +/* Special capability IDs predefined access */ +#define PCI_CAP_ID_INVALID 0xFF /* default raw access */ +#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ + +/* Cap maximum number of ioeventfds per device (arbitrary) */ +#define VFIO_PCI_IOEVENTFD_MAX 1000 + +struct vfio_pci_ioeventfd { + struct list_head next; + struct vfio_pci_core_device *vdev; + struct virqfd *virqfd; + void __iomem *addr; + uint64_t data; + loff_t pos; + int bar; + int count; + bool test_mem; +}; + +#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) + +void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); +void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); + +int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, + unsigned index, unsigned start, unsigned count, + void *data); + +ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); + +#ifdef CONFIG_VFIO_PCI_VGA +ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite); +#else +static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite) +{ + return -EINVAL; +} +#endif + +long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, + uint64_t data, int count, int fd); + +int vfio_pci_init_perm_bits(void); +void vfio_pci_uninit_perm_bits(void); + +int vfio_config_init(struct vfio_pci_core_device *vdev); +void vfio_config_free(struct vfio_pci_core_device *vdev); + +int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, + pci_power_t state); + +bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); +void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); +u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); +void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, + u16 cmd); + +#ifdef CONFIG_VFIO_PCI_IGD +int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); +#else +static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) +{ + return -ENODEV; +} +#endif + +#ifdef CONFIG_VFIO_PCI_ZDEV_KVM +int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps); +int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev); +void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev); +#else +static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, + struct vfio_info_cap *caps) +{ + return -ENODEV; +} + +static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) +{ + return 0; +} + +static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) +{} +#endif + +static inline bool vfio_pci_is_vga(struct pci_dev *pdev) +{ + return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; +} + +#endif diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 82ac1569deb0..d5e9883c1eee 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -17,7 +17,7 @@ #include #include -#include +#include "vfio_pci_priv.h" #ifdef __LITTLE_ENDIAN #define vfio_ioread64 ioread64 diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index e163aa9f6144..0bff24f0d4d7 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -15,7 +15,7 @@ #include #include -#include +#include "vfio_pci_priv.h" /* * Add the Base PCI Function information to the device info region. diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 5579ece4347b..9d18b832e61a 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -20,39 +20,10 @@ #define VFIO_PCI_CORE_H #define VFIO_PCI_OFFSET_SHIFT 40 - #define VFIO_PCI_OFFSET_TO_INDEX(off) (off >> VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT) #define VFIO_PCI_OFFSET_MASK (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1) -/* Special capability IDs predefined access */ -#define PCI_CAP_ID_INVALID 0xFF /* default raw access */ -#define PCI_CAP_ID_INVALID_VIRT 0xFE /* default virt access */ - -/* Cap maximum number of ioeventfds per device (arbitrary) */ -#define VFIO_PCI_IOEVENTFD_MAX 1000 - -struct vfio_pci_ioeventfd { - struct list_head next; - struct vfio_pci_core_device *vdev; - struct virqfd *virqfd; - void __iomem *addr; - uint64_t data; - loff_t pos; - int bar; - int count; - bool test_mem; -}; - -struct vfio_pci_irq_ctx { - struct eventfd_ctx *trigger; - struct virqfd *unmask; - struct virqfd *mask; - char *name; - bool masked; - struct irq_bypass_producer producer; -}; - struct vfio_pci_core_device; struct vfio_pci_region; @@ -78,23 +49,6 @@ struct vfio_pci_region { u32 flags; }; -struct vfio_pci_dummy_resource { - struct resource resource; - int index; - struct list_head res_next; -}; - -struct vfio_pci_vf_token { - struct mutex lock; - uuid_t uuid; - int users; -}; - -struct vfio_pci_mmap_vma { - struct vm_area_struct *vma; - struct list_head vma_next; -}; - struct vfio_pci_core_device { struct vfio_device vdev; struct pci_dev *pdev; @@ -141,92 +95,11 @@ struct vfio_pci_core_device { struct rw_semaphore memory_lock; }; -#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) -#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) -#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) -#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) -#define irq_is(vdev, type) (vdev->irq_type == type) - -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); -void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); - -int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, - uint32_t flags, unsigned index, - unsigned start, unsigned count, void *data); - -ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite); - -ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite); - -#ifdef CONFIG_VFIO_PCI_VGA -ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite); -#else -static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, - char __user *buf, size_t count, - loff_t *ppos, bool iswrite) -{ - return -EINVAL; -} -#endif - -long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, - uint64_t data, int count, int fd); - -int vfio_pci_init_perm_bits(void); -void vfio_pci_uninit_perm_bits(void); - -int vfio_config_init(struct vfio_pci_core_device *vdev); -void vfio_config_free(struct vfio_pci_core_device *vdev); - +/* Will be exported for vfio pci drivers usage */ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, unsigned int type, unsigned int subtype, const struct vfio_pci_regops *ops, size_t size, u32 flags, void *data); - -int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, - pci_power_t state); - -bool __vfio_pci_memory_enabled(struct vfio_pci_core_device *vdev); -void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev); -u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev); -void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, - u16 cmd); - -#ifdef CONFIG_VFIO_PCI_IGD -int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); -#else -static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) -{ - return -ENODEV; -} -#endif - -#ifdef CONFIG_VFIO_PCI_ZDEV_KVM -int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, - struct vfio_info_cap *caps); -int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev); -void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev); -#else -static inline int vfio_pci_info_zdev_add_caps(struct vfio_pci_core_device *vdev, - struct vfio_info_cap *caps) -{ - return -ENODEV; -} - -static inline int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) -{ - return 0; -} - -static inline void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) -{} -#endif - -/* Will be exported for vfio pci drivers usage */ void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); @@ -256,9 +129,4 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); -static inline bool vfio_pci_is_vga(struct pci_dev *pdev) -{ - return (pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA; -} - #endif /* VFIO_PCI_CORE_H */ From 1e979ef5df8b7b604a625343a179b812a7984068 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:02 -0300 Subject: [PATCH 02/77] vfio/pci: Rename vfio_pci_register_dev_region() As this is part of the vfio_pci_core component it should be called vfio_pci_core_register_dev_region() like everything else exported from this module. Suggested-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/2-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 10 +++++----- drivers/vfio/pci/vfio_pci_igd.c | 6 +++--- include/linux/vfio_pci_core.h | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 04180a0836cc..84279b6941bc 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -662,10 +662,10 @@ static int msix_mmappable_cap(struct vfio_pci_core_device *vdev, return vfio_info_add_capability(caps, &header, sizeof(header)); } -int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, - unsigned int type, unsigned int subtype, - const struct vfio_pci_regops *ops, - size_t size, u32 flags, void *data) +int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data) { struct vfio_pci_region *region; @@ -687,7 +687,7 @@ int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, return 0; } -EXPORT_SYMBOL_GPL(vfio_pci_register_dev_region); +EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region); long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 8177e9a1da3b..5e6ca5926954 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -257,7 +257,7 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev) } } - ret = vfio_pci_register_dev_region(vdev, + ret = vfio_pci_core_register_dev_region(vdev, PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &vfio_pci_igd_regops, size, VFIO_REGION_INFO_FLAG_READ, opregionvbt); @@ -402,7 +402,7 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) return -EINVAL; } - ret = vfio_pci_register_dev_region(vdev, + ret = vfio_pci_core_register_dev_region(vdev, PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG, &vfio_pci_igd_cfg_regops, host_bridge->cfg_size, @@ -422,7 +422,7 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) return -EINVAL; } - ret = vfio_pci_register_dev_region(vdev, + ret = vfio_pci_core_register_dev_region(vdev, PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG, &vfio_pci_igd_cfg_regops, lpc_bridge->cfg_size, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 9d18b832e61a..e5cf0d3313a6 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -96,10 +96,10 @@ struct vfio_pci_core_device { }; /* Will be exported for vfio pci drivers usage */ -int vfio_pci_register_dev_region(struct vfio_pci_core_device *vdev, - unsigned int type, unsigned int subtype, - const struct vfio_pci_regops *ops, - size_t size, u32 flags, void *data); +int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, + unsigned int type, unsigned int subtype, + const struct vfio_pci_regops *ops, + size_t size, u32 flags, void *data); void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); From c462a8c5d98877b76cf229d3d605d2a865aa9c9e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Aug 2022 16:34:03 -0300 Subject: [PATCH 03/77] vfio/pci: Simplify the is_intx/msi/msix/etc defines Only three of these are actually used, simplify to three inline functions, and open code the if statement in vfio_pci_config.c. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Cornelia Huck Link: https://lore.kernel.org/r/3-v2-1bd95d72f298+e0e-vfio_pci_priv_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 2 +- drivers/vfio/pci/vfio_pci_intrs.c | 22 +++++++++++++++++----- drivers/vfio/pci/vfio_pci_priv.h | 2 -- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 5f43b28075ee..4a350421c5f6 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1166,7 +1166,7 @@ static int vfio_msi_config_write(struct vfio_pci_core_device *vdev, int pos, flags = le16_to_cpu(*pflags); /* MSI is enabled via ioctl */ - if (!is_msi(vdev)) + if (vdev->irq_type != VFIO_PCI_MSI_IRQ_INDEX) flags &= ~PCI_MSI_FLAGS_ENABLE; /* Check queue size */ diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 32d014421c1f..8cb987ef3c47 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -22,11 +22,6 @@ #include "vfio_pci_priv.h" -#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) -#define is_msix(vdev) (vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX) -#define is_irq_none(vdev) (!(is_intx(vdev) || is_msi(vdev) || is_msix(vdev))) -#define irq_is(vdev, type) (vdev->irq_type == type) - struct vfio_pci_irq_ctx { struct eventfd_ctx *trigger; struct virqfd *unmask; @@ -36,6 +31,23 @@ struct vfio_pci_irq_ctx { struct irq_bypass_producer producer; }; +static bool irq_is(struct vfio_pci_core_device *vdev, int type) +{ + return vdev->irq_type == type; +} + +static bool is_intx(struct vfio_pci_core_device *vdev) +{ + return vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX; +} + +static bool is_irq_none(struct vfio_pci_core_device *vdev) +{ + return !(vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX || + vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX || + vdev->irq_type == VFIO_PCI_MSIX_IRQ_INDEX); +} + /* * INTx */ diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index ac701f05bef0..4830fb01a1ca 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -23,8 +23,6 @@ struct vfio_pci_ioeventfd { bool test_mem; }; -#define is_msi(vdev) (vdev->irq_type == VFIO_PCI_MSI_IRQ_INDEX) - void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); From 16f4cbd9e156193312db19a68fe37c09854f77a8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:15:56 -0300 Subject: [PATCH 04/77] vfio-pci: Fix vfio_pci_ioeventfd() to return int This only returns 0 or -ERRNO, it should return int like all the other ioctl dispatch functions. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_priv.h | 4 ++-- drivers/vfio/pci/vfio_pci_rdwr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 4830fb01a1ca..58b8d34c162c 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -48,8 +48,8 @@ static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, } #endif -long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, - uint64_t data, int count, int fd); +int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, + uint64_t data, int count, int fd); int vfio_pci_init_perm_bits(void); void vfio_pci_uninit_perm_bits(void); diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index d5e9883c1eee..e352a033b4ae 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -412,8 +412,8 @@ static void vfio_pci_ioeventfd_thread(void *opaque, void *unused) vfio_pci_ioeventfd_do_write(ioeventfd, ioeventfd->test_mem); } -long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, - uint64_t data, int count, int fd) +int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, + uint64_t data, int count, int fd) { struct pci_dev *pdev = vdev->pdev; loff_t pos = offset & VFIO_PCI_OFFSET_MASK; From 2ecf3b58ed7bc52ad58e02bb1596130fa6e6da53 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:15:57 -0300 Subject: [PATCH 05/77] vfio-pci: Break up vfio_pci_core_ioctl() into one function per ioctl 500 lines is a bit long for a single function, move the bodies of each ioctl into separate functions and leave behind a switch statement to dispatch them. This patch just adds the function declarations and does not fix the indenting. The next patch will restore the indenting. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 97 ++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 84279b6941bc..85b9720e77d2 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -689,21 +689,15 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, } EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region); -long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) +static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, + void __user *arg) { - struct vfio_pci_core_device *vdev = - container_of(core_vdev, struct vfio_pci_core_device, vdev); - unsigned long minsz; - - if (cmd == VFIO_DEVICE_GET_INFO) { + unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); struct vfio_device_info info; struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; unsigned long capsz; int ret; - minsz = offsetofend(struct vfio_device_info, num_irqs); - /* For backward compatibility, cannot require this */ capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); @@ -752,15 +746,17 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +} - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) { +static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; struct vfio_region_info info; struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - minsz = offsetofend(struct vfio_region_info, offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT; @@ -897,12 +893,14 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +} - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) { +static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_irq_info, count); struct vfio_irq_info info; - minsz = offsetofend(struct vfio_irq_info, count); - if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT; @@ -933,15 +931,17 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; +} - } else if (cmd == VFIO_DEVICE_SET_IRQS) { +static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_irq_set, count); struct vfio_irq_set hdr; u8 *data = NULL; int max, ret = 0; size_t data_size = 0; - minsz = offsetofend(struct vfio_irq_set, count); - if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; @@ -968,8 +968,11 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, kfree(data); return ret; +} - } else if (cmd == VFIO_DEVICE_RESET) { +static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, + void __user *arg) +{ int ret; if (!vdev->reset_works) @@ -993,16 +996,20 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, up_write(&vdev->memory_lock); return ret; +} - } else if (cmd == VFIO_DEVICE_GET_PCI_HOT_RESET_INFO) { +static int +vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = + offsetofend(struct vfio_pci_hot_reset_info, count); struct vfio_pci_hot_reset_info hdr; struct vfio_pci_fill_info fill = { 0 }; struct vfio_pci_dependent_device *devices = NULL; bool slot = false; int ret = 0; - minsz = offsetofend(struct vfio_pci_hot_reset_info, count); - if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; @@ -1066,8 +1073,12 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, kfree(devices); return ret; +} - } else if (cmd == VFIO_DEVICE_PCI_HOT_RESET) { +static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); struct vfio_pci_hot_reset hdr; int32_t *group_fds; struct file **files; @@ -1075,8 +1086,6 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, bool slot = false; int file_idx, count = 0, ret = 0; - minsz = offsetofend(struct vfio_pci_hot_reset, count); - if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; @@ -1160,12 +1169,15 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, kfree(files); return ret; - } else if (cmd == VFIO_DEVICE_IOEVENTFD) { +} + +static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, + void __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd); struct vfio_device_ioeventfd ioeventfd; int count; - minsz = offsetofend(struct vfio_device_ioeventfd, fd); - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) return -EFAULT; @@ -1182,8 +1194,35 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count, ioeventfd.fd); +} + +long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_DEVICE_GET_INFO: + return vfio_pci_ioctl_get_info(vdev, uarg); + case VFIO_DEVICE_GET_IRQ_INFO: + return vfio_pci_ioctl_get_irq_info(vdev, uarg); + case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO: + return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg); + case VFIO_DEVICE_GET_REGION_INFO: + return vfio_pci_ioctl_get_region_info(vdev, uarg); + case VFIO_DEVICE_IOEVENTFD: + return vfio_pci_ioctl_ioeventfd(vdev, uarg); + case VFIO_DEVICE_PCI_HOT_RESET: + return vfio_pci_ioctl_pci_hot_reset(vdev, uarg); + case VFIO_DEVICE_RESET: + return vfio_pci_ioctl_reset(vdev, uarg); + case VFIO_DEVICE_SET_IRQS: + return vfio_pci_ioctl_set_irqs(vdev, uarg); + default: + return -ENOTTY; } - return -ENOTTY; } EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); From ea3fc04d4fad2d31adb8a25115d4bd53b214bfc4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:15:58 -0300 Subject: [PATCH 06/77] vfio-pci: Re-indent what was vfio_pci_core_ioctl() Done mechanically with: $ git clang-format-14 -i --lines 675:1210 drivers/vfio/pci/vfio_pci_core.c And manually reflow the multi-line comments clang-format doesn't fix. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 745 +++++++++++++++---------------- 1 file changed, 366 insertions(+), 379 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 85b9720e77d2..8bff8ab5e807 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -693,309 +693,300 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); - struct vfio_device_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; - int ret; + struct vfio_device_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + unsigned long capsz; + int ret; - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); + /* For backward compatibility, cannot require this */ + capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; - } + if (info.argsz >= capsz) { + minsz = capsz; + info.cap_offset = 0; + } - info.flags = VFIO_DEVICE_FLAGS_PCI; + info.flags = VFIO_DEVICE_FLAGS_PCI; - if (vdev->reset_works) - info.flags |= VFIO_DEVICE_FLAGS_RESET; + if (vdev->reset_works) + info.flags |= VFIO_DEVICE_FLAGS_RESET; - info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; - info.num_irqs = VFIO_PCI_NUM_IRQS; + info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions; + info.num_irqs = VFIO_PCI_NUM_IRQS; - ret = vfio_pci_info_zdev_add_caps(vdev, &caps); - if (ret && ret != -ENODEV) { - pci_warn(vdev->pdev, "Failed to setup zPCI info capabilities\n"); - return ret; - } + ret = vfio_pci_info_zdev_add_caps(vdev, &caps); + if (ret && ret != -ENODEV) { + pci_warn(vdev->pdev, + "Failed to setup zPCI info capabilities\n"); + return ret; + } - if (caps.size) { - info.flags |= VFIO_DEVICE_FLAGS_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); + if (caps.size) { + info.flags |= VFIO_DEVICE_FLAGS_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; } - - kfree(caps.buf); + info.cap_offset = sizeof(info); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_region_info, offset); - struct pci_dev *pdev = vdev->pdev; - struct vfio_region_info info; - struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - int i, ret; + struct pci_dev *pdev = vdev->pdev; + struct vfio_region_info info; + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + int i, ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + if (info.argsz < minsz) + return -EINVAL; - switch (info.index) { - case VFIO_PCI_CONFIG_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pdev->cfg_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - break; - case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - info.flags = 0; - break; - } - - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - if (vdev->bar_mmap_supported[info.index]) { - info.flags |= VFIO_REGION_INFO_FLAG_MMAP; - if (info.index == vdev->msix_bar) { - ret = msix_mmappable_cap(vdev, &caps); - if (ret) - return ret; - } - } - - break; - case VFIO_PCI_ROM_REGION_INDEX: - { - void __iomem *io; - size_t size; - u16 cmd; - - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + switch (info.index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pdev->cfg_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = pci_resource_len(pdev, info.index); + if (!info.size) { info.flags = 0; - - /* Report the BAR size, not the ROM size */ - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - /* Shadow ROMs appear as PCI option ROMs */ - if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) - info.size = 0x20000; - else - break; - } - - /* - * Is it really there? Enable memory decode for - * implicit access in pci_map_rom(). - */ - cmd = vfio_pci_memory_lock_and_enable(vdev); - io = pci_map_rom(pdev, &size); - if (io) { - info.flags = VFIO_REGION_INFO_FLAG_READ; - pci_unmap_rom(pdev, io); - } else { - info.size = 0; - } - vfio_pci_memory_unlock_and_restore(vdev, cmd); - break; } - case VFIO_PCI_VGA_REGION_INDEX: - if (!vdev->has_vga) - return -EINVAL; - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = 0xc0000; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - - break; - default: - { - struct vfio_region_info_cap_type cap_type = { - .header.id = VFIO_REGION_INFO_CAP_TYPE, - .header.version = 1 }; - - if (info.index >= - VFIO_PCI_NUM_REGIONS + vdev->num_regions) - return -EINVAL; - info.index = array_index_nospec(info.index, - VFIO_PCI_NUM_REGIONS + - vdev->num_regions); - - i = info.index - VFIO_PCI_NUM_REGIONS; - - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = vdev->region[i].size; - info.flags = vdev->region[i].flags; - - cap_type.type = vdev->region[i].type; - cap_type.subtype = vdev->region[i].subtype; - - ret = vfio_info_add_capability(&caps, &cap_type.header, - sizeof(cap_type)); - if (ret) - return ret; - - if (vdev->region[i].ops->add_capability) { - ret = vdev->region[i].ops->add_capability(vdev, - &vdev->region[i], &caps); + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + if (vdev->bar_mmap_supported[info.index]) { + info.flags |= VFIO_REGION_INFO_FLAG_MMAP; + if (info.index == vdev->msix_bar) { + ret = msix_mmappable_cap(vdev, &caps); if (ret) return ret; } } + + break; + case VFIO_PCI_ROM_REGION_INDEX: { + void __iomem *io; + size_t size; + u16 cmd; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.flags = 0; + + /* Report the BAR size, not the ROM size */ + info.size = pci_resource_len(pdev, info.index); + if (!info.size) { + /* Shadow ROMs appear as PCI option ROMs */ + if (pdev->resource[PCI_ROM_RESOURCE].flags & + IORESOURCE_ROM_SHADOW) + info.size = 0x20000; + else + break; } - if (caps.size) { - info.flags |= VFIO_REGION_INFO_FLAG_CAPS; - if (info.argsz < sizeof(info) + caps.size) { - info.argsz = sizeof(info) + caps.size; - info.cap_offset = 0; - } else { - vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + - sizeof(info), caps.buf, - caps.size)) { - kfree(caps.buf); - return -EFAULT; - } - info.cap_offset = sizeof(info); + /* + * Is it really there? Enable memory decode for implicit access + * in pci_map_rom(). + */ + cmd = vfio_pci_memory_lock_and_enable(vdev); + io = pci_map_rom(pdev, &size); + if (io) { + info.flags = VFIO_REGION_INFO_FLAG_READ; + pci_unmap_rom(pdev, io); + } else { + info.size = 0; + } + vfio_pci_memory_unlock_and_restore(vdev, cmd); + + break; + } + case VFIO_PCI_VGA_REGION_INDEX: + if (!vdev->has_vga) + return -EINVAL; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = 0xc0000; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + + break; + default: { + struct vfio_region_info_cap_type cap_type = { + .header.id = VFIO_REGION_INFO_CAP_TYPE, + .header.version = 1 + }; + + if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) + return -EINVAL; + info.index = array_index_nospec( + info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions); + + i = info.index - VFIO_PCI_NUM_REGIONS; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = vdev->region[i].size; + info.flags = vdev->region[i].flags; + + cap_type.type = vdev->region[i].type; + cap_type.subtype = vdev->region[i].subtype; + + ret = vfio_info_add_capability(&caps, &cap_type.header, + sizeof(cap_type)); + if (ret) + return ret; + + if (vdev->region[i].ops->add_capability) { + ret = vdev->region[i].ops->add_capability( + vdev, &vdev->region[i], &caps); + if (ret) + return ret; + } + } + } + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + sizeof(info), + caps.buf, caps.size)) { + kfree(caps.buf); + return -EFAULT; } - - kfree(caps.buf); + info.cap_offset = sizeof(info); } - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + kfree(caps.buf); + } + + return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_irq_info, count); - struct vfio_irq_info info; + struct vfio_irq_info info; - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; - if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) - return -EINVAL; + if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) + return -EINVAL; - switch (info.index) { - case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: - case VFIO_PCI_REQ_IRQ_INDEX: + switch (info.index) { + case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: + break; + case VFIO_PCI_ERR_IRQ_INDEX: + if (pci_is_pcie(vdev->pdev)) break; - case VFIO_PCI_ERR_IRQ_INDEX: - if (pci_is_pcie(vdev->pdev)) - break; - fallthrough; - default: - return -EINVAL; - } + fallthrough; + default: + return -EINVAL; + } - info.flags = VFIO_IRQ_INFO_EVENTFD; + info.flags = VFIO_IRQ_INFO_EVENTFD; - info.count = vfio_pci_get_irq_count(vdev, info.index); + info.count = vfio_pci_get_irq_count(vdev, info.index); - if (info.index == VFIO_PCI_INTX_IRQ_INDEX) - info.flags |= (VFIO_IRQ_INFO_MASKABLE | - VFIO_IRQ_INFO_AUTOMASKED); - else - info.flags |= VFIO_IRQ_INFO_NORESIZE; + if (info.index == VFIO_PCI_INTX_IRQ_INDEX) + info.flags |= + (VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED); + else + info.flags |= VFIO_IRQ_INFO_NORESIZE; - return copy_to_user((void __user *)arg, &info, minsz) ? - -EFAULT : 0; + return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; } static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_irq_set, count); - struct vfio_irq_set hdr; - u8 *data = NULL; - int max, ret = 0; - size_t data_size = 0; + struct vfio_irq_set hdr; + u8 *data = NULL; + int max, ret = 0; + size_t data_size = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; - max = vfio_pci_get_irq_count(vdev, hdr.index); - - ret = vfio_set_irqs_validate_and_prepare(&hdr, max, - VFIO_PCI_NUM_IRQS, &data_size); - if (ret) - return ret; - - if (data_size) { - data = memdup_user((void __user *)(arg + minsz), - data_size); - if (IS_ERR(data)) - return PTR_ERR(data); - } - - mutex_lock(&vdev->igate); - - ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, - hdr.start, hdr.count, data); - - mutex_unlock(&vdev->igate); - kfree(data); + max = vfio_pci_get_irq_count(vdev, hdr.index); + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS, + &data_size); + if (ret) return ret; + + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), data_size); + if (IS_ERR(data)) + return PTR_ERR(data); + } + + mutex_lock(&vdev->igate); + + ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, hdr.start, + hdr.count, data); + + mutex_unlock(&vdev->igate); + kfree(data); + + return ret; } static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, void __user *arg) { - int ret; + int ret; - if (!vdev->reset_works) - return -EINVAL; + if (!vdev->reset_works) + return -EINVAL; - vfio_pci_zap_and_down_write_memory_lock(vdev); + vfio_pci_zap_and_down_write_memory_lock(vdev); - /* - * This function can be invoked while the power state is non-D0. - * If pci_try_reset_function() has been called while the power - * state is non-D0, then pci_try_reset_function() will - * internally set the power state to D0 without vfio driver - * involvement. For the devices which have NoSoftRst-, the - * reset function can cause the PCI config space reset without - * restoring the original state (saved locally in - * 'vdev->pm_save'). - */ - vfio_pci_set_power_state(vdev, PCI_D0); + /* + * This function can be invoked while the power state is non-D0. If + * pci_try_reset_function() has been called while the power state is + * non-D0, then pci_try_reset_function() will internally set the power + * state to D0 without vfio driver involvement. For the devices which + * have NoSoftRst-, the reset function can cause the PCI config space + * reset without restoring the original state (saved locally in + * 'vdev->pm_save'). + */ + vfio_pci_set_power_state(vdev, PCI_D0); - ret = pci_try_reset_function(vdev->pdev); - up_write(&vdev->memory_lock); + ret = pci_try_reset_function(vdev->pdev); + up_write(&vdev->memory_lock); - return ret; + return ret; } static int @@ -1004,196 +995,192 @@ vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); - struct vfio_pci_hot_reset_info hdr; - struct vfio_pci_fill_info fill = { 0 }; - struct vfio_pci_dependent_device *devices = NULL; - bool slot = false; - int ret = 0; + struct vfio_pci_hot_reset_info hdr; + struct vfio_pci_fill_info fill = { 0 }; + struct vfio_pci_dependent_device *devices = NULL; + bool slot = false; + int ret = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; - if (hdr.argsz < minsz) - return -EINVAL; + if (hdr.argsz < minsz) + return -EINVAL; - hdr.flags = 0; + hdr.flags = 0; - /* Can we do a slot or bus reset or neither? */ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; - /* How many devices are affected? */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_count_devs, - &fill.max, slot); - if (ret) - return ret; + /* How many devices are affected? */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &fill.max, slot); + if (ret) + return ret; - WARN_ON(!fill.max); /* Should always be at least one */ + WARN_ON(!fill.max); /* Should always be at least one */ - /* - * If there's enough space, fill it now, otherwise return - * -ENOSPC and the number of devices affected. - */ - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { - ret = -ENOSPC; - hdr.count = fill.max; - goto reset_info_exit; - } + /* + * If there's enough space, fill it now, otherwise return -ENOSPC and + * the number of devices affected. + */ + if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { + ret = -ENOSPC; + hdr.count = fill.max; + goto reset_info_exit; + } - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; + devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); + if (!devices) + return -ENOMEM; - fill.devices = devices; + fill.devices = devices; - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_fill_devs, - &fill, slot); + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, + &fill, slot); - /* - * If a device was removed between counting and filling, - * we may come up short of fill.max. If a device was - * added, we'll have a return of -EAGAIN above. - */ - if (!ret) - hdr.count = fill.cur; + /* + * If a device was removed between counting and filling, we may come up + * short of fill.max. If a device was added, we'll have a return of + * -EAGAIN above. + */ + if (!ret) + hdr.count = fill.cur; reset_info_exit: - if (copy_to_user((void __user *)arg, &hdr, minsz)) + if (copy_to_user((void __user *)arg, &hdr, minsz)) + ret = -EFAULT; + + if (!ret) { + if (copy_to_user((void __user *)(arg + minsz), devices, + hdr.count * sizeof(*devices))) ret = -EFAULT; + } - if (!ret) { - if (copy_to_user((void __user *)(arg + minsz), devices, - hdr.count * sizeof(*devices))) - ret = -EFAULT; - } - - kfree(devices); - return ret; + kfree(devices); + return ret; } static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); - struct vfio_pci_hot_reset hdr; - int32_t *group_fds; - struct file **files; - struct vfio_pci_group_info info; - bool slot = false; - int file_idx, count = 0, ret = 0; + struct vfio_pci_hot_reset hdr; + int32_t *group_fds; + struct file **files; + struct vfio_pci_group_info info; + bool slot = false; + int file_idx, count = 0, ret = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&hdr, (void __user *)arg, minsz)) + return -EFAULT; - if (hdr.argsz < minsz || hdr.flags) - return -EINVAL; + if (hdr.argsz < minsz || hdr.flags) + return -EINVAL; - /* Can we do a slot or bus reset or neither? */ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; - /* - * We can't let userspace give us an arbitrarily large - * buffer to copy, so verify how many we think there - * could be. Note groups can have multiple devices so - * one group per device is the max. - */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, - vfio_pci_count_devs, - &count, slot); - if (ret) - return ret; + /* + * We can't let userspace give us an arbitrarily large buffer to copy, + * so verify how many we think there could be. Note groups can have + * multiple devices so one group per device is the max. + */ + ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, + &count, slot); + if (ret) + return ret; - /* Somewhere between 1 and count is OK */ - if (!hdr.count || hdr.count > count) - return -EINVAL; - - group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); - files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); - if (!group_fds || !files) { - kfree(group_fds); - kfree(files); - return -ENOMEM; - } - - if (copy_from_user(group_fds, (void __user *)(arg + minsz), - hdr.count * sizeof(*group_fds))) { - kfree(group_fds); - kfree(files); - return -EFAULT; - } - - /* - * For each group_fd, get the group through the vfio external - * user interface and store the group and iommu ID. This - * ensures the group is held across the reset. - */ - for (file_idx = 0; file_idx < hdr.count; file_idx++) { - struct file *file = fget(group_fds[file_idx]); - - if (!file) { - ret = -EBADF; - break; - } - - /* Ensure the FD is a vfio group FD.*/ - if (!vfio_file_iommu_group(file)) { - fput(file); - ret = -EINVAL; - break; - } - - files[file_idx] = file; - } + /* Somewhere between 1 and count is OK */ + if (!hdr.count || hdr.count > count) + return -EINVAL; + group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); + files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); + if (!group_fds || !files) { kfree(group_fds); + kfree(files); + return -ENOMEM; + } - /* release reference to groups on error */ - if (ret) - goto hot_reset_release; + if (copy_from_user(group_fds, (void __user *)(arg + minsz), + hdr.count * sizeof(*group_fds))) { + kfree(group_fds); + kfree(files); + return -EFAULT; + } - info.count = hdr.count; - info.files = files; + /* + * For each group_fd, get the group through the vfio external user + * interface and store the group and iommu ID. This ensures the group + * is held across the reset. + */ + for (file_idx = 0; file_idx < hdr.count; file_idx++) { + struct file *file = fget(group_fds[file_idx]); - ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); + if (!file) { + ret = -EBADF; + break; + } + + /* Ensure the FD is a vfio group FD.*/ + if (!vfio_file_iommu_group(file)) { + fput(file); + ret = -EINVAL; + break; + } + + files[file_idx] = file; + } + + kfree(group_fds); + + /* release reference to groups on error */ + if (ret) + goto hot_reset_release; + + info.count = hdr.count; + info.files = files; + + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); hot_reset_release: - for (file_idx--; file_idx >= 0; file_idx--) - fput(files[file_idx]); + for (file_idx--; file_idx >= 0; file_idx--) + fput(files[file_idx]); - kfree(files); - return ret; + kfree(files); + return ret; } static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, void __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd); - struct vfio_device_ioeventfd ioeventfd; - int count; + struct vfio_device_ioeventfd ioeventfd; + int count; - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) - return -EFAULT; + if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) + return -EFAULT; - if (ioeventfd.argsz < minsz) - return -EINVAL; + if (ioeventfd.argsz < minsz) + return -EINVAL; - if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) - return -EINVAL; + if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK) + return -EINVAL; - count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; + count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK; - if (hweight8(count) != 1 || ioeventfd.fd < -1) - return -EINVAL; + if (hweight8(count) != 1 || ioeventfd.fd < -1) + return -EINVAL; - return vfio_pci_ioeventfd(vdev, ioeventfd.offset, - ioeventfd.data, count, ioeventfd.fd); + return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count, + ioeventfd.fd); } long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, From 663eab456e072bbcd02c2516d54b53f7ecd57dd3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:15:59 -0300 Subject: [PATCH 07/77] vfio-pci: Replace 'void __user *' with proper types in the ioctl functions This makes the code clearer and replaces a few places trying to access a flex array with an actual flex array. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 58 +++++++++++++++----------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 8bff8ab5e807..9273f1ffd0dd 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -690,7 +690,7 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region); static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_device_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); struct vfio_device_info info; @@ -701,7 +701,7 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, /* For backward compatibility, cannot require this */ capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -733,22 +733,21 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, info.argsz = sizeof(info) + caps.size; } else { vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + sizeof(info), - caps.buf, caps.size)) { + if (copy_to_user(arg + 1, caps.buf, caps.size)) { kfree(caps.buf); return -EFAULT; } - info.cap_offset = sizeof(info); + info.cap_offset = sizeof(*arg); } kfree(caps.buf); } - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_region_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_region_info, offset); struct pci_dev *pdev = vdev->pdev; @@ -756,7 +755,7 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; int i, ret; - if (copy_from_user(&info, (void __user *)arg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) @@ -875,27 +874,26 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, info.cap_offset = 0; } else { vfio_info_cap_shift(&caps, sizeof(info)); - if (copy_to_user((void __user *)arg + sizeof(info), - caps.buf, caps.size)) { + if (copy_to_user(arg + 1, caps.buf, caps.size)) { kfree(caps.buf); return -EFAULT; } - info.cap_offset = sizeof(info); + info.cap_offset = sizeof(*arg); } kfree(caps.buf); } - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_irq_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_irq_info, count); struct vfio_irq_info info; - if (copy_from_user(&info, (void __user *)arg, minsz)) + if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS) @@ -923,11 +921,11 @@ static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev, else info.flags |= VFIO_IRQ_INFO_NORESIZE; - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + return copy_to_user(arg, &info, minsz) ? -EFAULT : 0; } static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_irq_set __user *arg) { unsigned long minsz = offsetofend(struct vfio_irq_set, count); struct vfio_irq_set hdr; @@ -935,7 +933,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, int max, ret = 0; size_t data_size = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) + if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; max = vfio_pci_get_irq_count(vdev, hdr.index); @@ -946,7 +944,7 @@ static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev, return ret; if (data_size) { - data = memdup_user((void __user *)(arg + minsz), data_size); + data = memdup_user(&arg->data, data_size); if (IS_ERR(data)) return PTR_ERR(data); } @@ -989,9 +987,9 @@ static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev, return ret; } -static int -vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, - void __user *arg) +static int vfio_pci_ioctl_get_pci_hot_reset_info( + struct vfio_pci_core_device *vdev, + struct vfio_pci_hot_reset_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); @@ -1001,7 +999,7 @@ vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, bool slot = false; int ret = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) + if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; if (hdr.argsz < minsz) @@ -1051,11 +1049,11 @@ vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, hdr.count = fill.cur; reset_info_exit: - if (copy_to_user((void __user *)arg, &hdr, minsz)) + if (copy_to_user(arg, &hdr, minsz)) ret = -EFAULT; if (!ret) { - if (copy_to_user((void __user *)(arg + minsz), devices, + if (copy_to_user(&arg->devices, devices, hdr.count * sizeof(*devices))) ret = -EFAULT; } @@ -1065,7 +1063,7 @@ vfio_pci_ioctl_get_pci_hot_reset_info(struct vfio_pci_core_device *vdev, } static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_pci_hot_reset __user *arg) { unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); struct vfio_pci_hot_reset hdr; @@ -1075,7 +1073,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, bool slot = false; int file_idx, count = 0, ret = 0; - if (copy_from_user(&hdr, (void __user *)arg, minsz)) + if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; if (hdr.argsz < minsz || hdr.flags) @@ -1109,7 +1107,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, return -ENOMEM; } - if (copy_from_user(group_fds, (void __user *)(arg + minsz), + if (copy_from_user(group_fds, arg->group_fds, hdr.count * sizeof(*group_fds))) { kfree(group_fds); kfree(files); @@ -1159,13 +1157,13 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, } static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, - void __user *arg) + struct vfio_device_ioeventfd __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd); struct vfio_device_ioeventfd ioeventfd; int count; - if (copy_from_user(&ioeventfd, (void __user *)arg, minsz)) + if (copy_from_user(&ioeventfd, arg, minsz)) return -EFAULT; if (ioeventfd.argsz < minsz) @@ -1214,7 +1212,7 @@ long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl); static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags, - void __user *arg, size_t argsz) + uuid_t __user *arg, size_t argsz) { struct vfio_pci_core_device *vdev = container_of(device, struct vfio_pci_core_device, vdev); From 150ee2f9cd9411a3fdbc55cef2fb01349216dbd7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:16:00 -0300 Subject: [PATCH 08/77] vfio: Fold VFIO_GROUP_GET_DEVICE_FD into vfio_group_get_device_fd() No reason to split it up like this, just have one function to process the ioctl. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 7cb56c382c97..3afef45b8d1a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1178,14 +1178,21 @@ static struct file *vfio_device_open(struct vfio_device *device) return ERR_PTR(ret); } -static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) +static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, + char __user *arg) { struct vfio_device *device; struct file *filep; + char *buf; int fdno; int ret; + buf = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(buf)) + return PTR_ERR(buf); + device = vfio_device_get_from_name(group, buf); + kfree(buf); if (IS_ERR(device)) return PTR_ERR(device); @@ -1215,9 +1222,12 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct vfio_group *group = filep->private_data; + void __user *uarg = (void __user *)arg; long ret = -ENOTTY; switch (cmd) { + case VFIO_GROUP_GET_DEVICE_FD: + return vfio_group_ioctl_get_device_fd(group, uarg); case VFIO_GROUP_GET_STATUS: { struct vfio_group_status status; @@ -1267,18 +1277,6 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, ret = vfio_group_unset_container(group); up_write(&group->group_rwsem); break; - case VFIO_GROUP_GET_DEVICE_FD: - { - char *buf; - - buf = strndup_user((const char __user *)arg, PAGE_SIZE); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - ret = vfio_group_get_device_fd(group, buf); - kfree(buf); - break; - } } return ret; From 67671f153e6b5a379623b57881a6cf99b4a6f977 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:16:01 -0300 Subject: [PATCH 09/77] vfio: Fold VFIO_GROUP_SET_CONTAINER into vfio_group_set_container() No reason to split it up like this, just have one function to process the ioctl. Move the lock into the function as well to avoid having a lockdep annotation. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 51 +++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 3afef45b8d1a..17c44ee81f9f 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -980,47 +980,54 @@ static int vfio_group_unset_container(struct vfio_group *group) return 0; } -static int vfio_group_set_container(struct vfio_group *group, int container_fd) +static int vfio_group_ioctl_set_container(struct vfio_group *group, + int __user *arg) { struct fd f; struct vfio_container *container; struct vfio_iommu_driver *driver; + int container_fd; int ret = 0; - lockdep_assert_held_write(&group->group_rwsem); - - if (group->container || WARN_ON(group->container_users)) - return -EINVAL; - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; + if (get_user(container_fd, arg)) + return -EFAULT; + if (container_fd < 0) + return -EINVAL; f = fdget(container_fd); if (!f.file) return -EBADF; /* Sanity check, is this really our fd? */ if (f.file->f_op != &vfio_fops) { - fdput(f); - return -EINVAL; + ret = -EINVAL; + goto out_fdput; } - container = f.file->private_data; WARN_ON(!container); /* fget ensures we don't race vfio_release */ + down_write(&group->group_rwsem); + + if (group->container || WARN_ON(group->container_users)) { + ret = -EINVAL; + goto out_unlock_group; + } + down_write(&container->group_lock); /* Real groups and fake groups cannot mix */ if (!list_empty(&container->group_list) && container->noiommu != (group->type == VFIO_NO_IOMMU)) { ret = -EPERM; - goto unlock_out; + goto out_unlock_container; } if (group->type == VFIO_IOMMU) { ret = iommu_group_claim_dma_owner(group->iommu_group, f.file); if (ret) - goto unlock_out; + goto out_unlock_container; } driver = container->iommu_driver; @@ -1032,7 +1039,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) if (group->type == VFIO_IOMMU) iommu_group_release_dma_owner( group->iommu_group); - goto unlock_out; + goto out_unlock_container; } } @@ -1044,8 +1051,11 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) /* Get a reference on the container and mark a user within the group */ vfio_container_get(container); -unlock_out: +out_unlock_container: up_write(&container->group_lock); +out_unlock_group: + up_write(&group->group_rwsem); +out_fdput: fdput(f); return ret; } @@ -1258,20 +1268,7 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, break; } case VFIO_GROUP_SET_CONTAINER: - { - int fd; - - if (get_user(fd, (int __user *)arg)) - return -EFAULT; - - if (fd < 0) - return -EINVAL; - - down_write(&group->group_rwsem); - ret = vfio_group_set_container(group, fd); - up_write(&group->group_rwsem); - break; - } + return vfio_group_ioctl_set_container(group, uarg); case VFIO_GROUP_UNSET_CONTAINER: down_write(&group->group_rwsem); ret = vfio_group_unset_container(group); From b3b43590fa276aef824a300b911fe5fb9083dbf5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:16:02 -0300 Subject: [PATCH 10/77] vfio: Follow the naming pattern for vfio_group_ioctl_unset_container() Make it clear that this is the body of the ioctl. Fold the locking into the function so it is self contained like the other ioctls. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/7-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 17c44ee81f9f..0bb75416acfc 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -968,16 +968,24 @@ static void __vfio_group_unset_container(struct vfio_group *group) * the group, we know that still exists, therefore the only valid * transition here is 1->0. */ -static int vfio_group_unset_container(struct vfio_group *group) +static int vfio_group_ioctl_unset_container(struct vfio_group *group) { - lockdep_assert_held_write(&group->group_rwsem); + int ret = 0; - if (!group->container) - return -EINVAL; - if (group->container_users != 1) - return -EBUSY; + down_write(&group->group_rwsem); + if (!group->container) { + ret = -EINVAL; + goto out_unlock; + } + if (group->container_users != 1) { + ret = -EBUSY; + goto out_unlock; + } __vfio_group_unset_container(group); - return 0; + +out_unlock: + up_write(&group->group_rwsem); + return ret; } static int vfio_group_ioctl_set_container(struct vfio_group *group, @@ -1270,10 +1278,7 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, case VFIO_GROUP_SET_CONTAINER: return vfio_group_ioctl_set_container(group, uarg); case VFIO_GROUP_UNSET_CONTAINER: - down_write(&group->group_rwsem); - ret = vfio_group_unset_container(group); - up_write(&group->group_rwsem); - break; + return vfio_group_ioctl_unset_container(group); } return ret; From 99a27c088b9c76d9e0f2a36152ffaf9891b224d3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 31 Aug 2022 17:16:03 -0300 Subject: [PATCH 11/77] vfio: Split VFIO_GROUP_GET_STATUS into a function This is the last sizable implementation in vfio_group_fops_unl_ioctl(), move it to a function so vfio_group_fops_unl_ioctl() is emptied out. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v2-0f9e632d54fb+d6-vfio_ioctl_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 61 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 0bb75416acfc..eb714a484662 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1236,52 +1236,51 @@ static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, return ret; } +static int vfio_group_ioctl_get_status(struct vfio_group *group, + struct vfio_group_status __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_group_status, flags); + struct vfio_group_status status; + + if (copy_from_user(&status, arg, minsz)) + return -EFAULT; + + if (status.argsz < minsz) + return -EINVAL; + + status.flags = 0; + + down_read(&group->group_rwsem); + if (group->container) + status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | + VFIO_GROUP_FLAGS_VIABLE; + else if (!iommu_group_dma_owner_claimed(group->iommu_group)) + status.flags |= VFIO_GROUP_FLAGS_VIABLE; + up_read(&group->group_rwsem); + + if (copy_to_user(arg, &status, minsz)) + return -EFAULT; + return 0; +} + static long vfio_group_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct vfio_group *group = filep->private_data; void __user *uarg = (void __user *)arg; - long ret = -ENOTTY; switch (cmd) { case VFIO_GROUP_GET_DEVICE_FD: return vfio_group_ioctl_get_device_fd(group, uarg); case VFIO_GROUP_GET_STATUS: - { - struct vfio_group_status status; - unsigned long minsz; - - minsz = offsetofend(struct vfio_group_status, flags); - - if (copy_from_user(&status, (void __user *)arg, minsz)) - return -EFAULT; - - if (status.argsz < minsz) - return -EINVAL; - - status.flags = 0; - - down_read(&group->group_rwsem); - if (group->container) - status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | - VFIO_GROUP_FLAGS_VIABLE; - else if (!iommu_group_dma_owner_claimed(group->iommu_group)) - status.flags |= VFIO_GROUP_FLAGS_VIABLE; - up_read(&group->group_rwsem); - - if (copy_to_user((void __user *)arg, &status, minsz)) - return -EFAULT; - - ret = 0; - break; - } + return vfio_group_ioctl_get_status(group, uarg); case VFIO_GROUP_SET_CONTAINER: return vfio_group_ioctl_set_container(group, uarg); case VFIO_GROUP_UNSET_CONTAINER: return vfio_group_ioctl_unset_container(group); + default: + return -ENOTTY; } - - return ret; } static int vfio_group_fops_open(struct inode *inode, struct file *filep) From 385ecfdfb5d5ad0ff37e20381c70e18af8cf1bdb Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:46 +0530 Subject: [PATCH 12/77] vfio: Add the device features for the low power entry and exit This patch adds the following new device features for the low power entry and exit in the header file. The implementation for the same will be added in the subsequent patches. - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP - VFIO_DEVICE_FEATURE_LOW_POWER_EXIT For vfio-pci based devices, with the standard PCI PM registers, all power states cannot be achieved. The platform-based power management needs to be involved to go into the lowest power state. For doing low power entry and exit with platform-based power management, these device features can be used. The entry device feature has two variants. These two variants are mainly to support the different behaviour for the low power entry. If there is any access for the VFIO device on the host side, then the device will be moved out of the low power state without the user's guest driver involvement. Some devices (for example NVIDIA VGA or 3D controller) require the user's guest driver involvement for each low-power entry. In the first variant, the host can return the device to low power automatically. The device will continue to attempt to reach low power until the low power exit feature is called. In the second variant, if the device exits low power due to an access, the host kernel will signal the user via the provided eventfd and will not return the device to low power without a subsequent call to one of the low power entry features. A call to the low power exit feature is optional if the user provided eventfd is signaled. These device features only support VFIO_DEVICE_FEATURE_SET and VFIO_DEVICE_FEATURE_PROBE operations. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-2-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 733a1cddde30..76a173f973de 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -986,6 +986,62 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_RUNNING_P2P = 5, }; +/* + * Upon VFIO_DEVICE_FEATURE_SET, allow the device to be moved into a low power + * state with the platform-based power management. Device use of lower power + * states depends on factors managed by the runtime power management core, + * including system level support and coordinating support among dependent + * devices. Enabling device low power entry does not guarantee lower power + * usage by the device, nor is a mechanism provided through this feature to + * know the current power state of the device. If any device access happens + * (either from the host or through the vfio uAPI) when the device is in the + * low power state, then the host will move the device out of the low power + * state as necessary prior to the access. Once the access is completed, the + * device may re-enter the low power state. For single shot low power support + * with wake-up notification, see + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP below. Access to mmap'd + * device regions is disabled on LOW_POWER_ENTRY and may only be resumed after + * calling LOW_POWER_EXIT. + */ +#define VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY 3 + +/* + * This device feature has the same behavior as + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY with the exception that the user + * provides an eventfd for wake-up notification. When the device moves out of + * the low power state for the wake-up, the host will not allow the device to + * re-enter a low power state without a subsequent user call to one of the low + * power entry device feature IOCTLs. Access to mmap'd device regions is + * disabled on LOW_POWER_ENTRY_WITH_WAKEUP and may only be resumed after the + * low power exit. The low power exit can happen either through LOW_POWER_EXIT + * or through any other access (where the wake-up notification has been + * generated). The access to mmap'd device regions will not trigger low power + * exit. + * + * The notification through the provided eventfd will be generated only when + * the device has entered and is resumed from a low power state after + * calling this device feature IOCTL. A device that has not entered low power + * state, as managed through the runtime power management core, will not + * generate a notification through the provided eventfd on access. Calling the + * LOW_POWER_EXIT feature is optional in the case where notification has been + * signaled on the provided eventfd that a resume from low power has occurred. + */ +struct vfio_device_low_power_entry_with_wakeup { + __s32 wakeup_eventfd; + __u32 reserved; +}; + +#define VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP 4 + +/* + * Upon VFIO_DEVICE_FEATURE_SET, disallow use of device low power states as + * previously enabled via VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY or + * VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP device features. + * This device feature IOCTL may itself generate a wakeup eventfd notification + * in the latter case if the device had previously entered a low power state. + */ +#define VFIO_DEVICE_FEATURE_LOW_POWER_EXIT 5 + /* -------- API for Type1 VFIO IOMMU -------- */ /** From 8e5c6995113d201addd651dc2db8e11c93ce639f Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:47 +0530 Subject: [PATCH 13/77] vfio: Increment the runtime PM usage count during IOCTL call The vfio-pci based drivers will have runtime power management support where the user can put the device into the low power state and then PCI devices can go into the D3cold state. If the device is in the low power state and the user issues any IOCTL, then the device should be moved out of the low power state first. Once the IOCTL is serviced, then it can go into the low power state again. The runtime PM framework manages this with help of usage count. One option was to add the runtime PM related API's inside vfio-pci driver but some IOCTL (like VFIO_DEVICE_FEATURE) can follow a different path and more IOCTL can be added in the future. Also, the runtime PM will be added for vfio-pci based drivers variant currently, but the other VFIO based drivers can use the same in the future. So, this patch adds the runtime calls runtime-related API in the top-level IOCTL function itself. For the VFIO drivers which do not have runtime power management support currently, the runtime PM API's won't be invoked. Only for vfio-pci based drivers currently, the runtime PM API's will be invoked to increment and decrement the usage count. In the vfio-pci drivers also, the variant drivers can opt-out by incrementing the usage count during device-open. The pm_runtime_resume_and_get() checks the device current status and will return early if the device is already in the ACTIVE state. Taking this usage count incremented while servicing IOCTL will make sure that the user won't put the device into the low power state when any other IOCTL is being serviced in parallel. Let's consider the following scenario: 1. Some other IOCTL is called. 2. The user has opened another device instance and called the IOCTL for low power entry. 3. The low power entry IOCTL moves the device into the low power state. 4. The other IOCTL finishes. If we don't keep the usage count incremented then the device access will happen between step 3 and 4 while the device has already gone into the low power state. The pm_runtime_resume_and_get() will be the first call so its error should not be propagated to user space directly. For example, if pm_runtime_resume_and_get() can return -EINVAL for the cases where the user has passed the correct argument. So the pm_runtime_resume_and_get() errors have been masked behind -EIO. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-3-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 52 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index eb714a484662..5edc49748013 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "vfio.h" #define DRIVER_VERSION "0.3" @@ -1353,6 +1354,39 @@ static const struct file_operations vfio_group_fops = { .release = vfio_group_fops_release, }; +/* + * Wrapper around pm_runtime_resume_and_get(). + * Return error code on failure or 0 on success. + */ +static inline int vfio_device_pm_runtime_get(struct vfio_device *device) +{ + struct device *dev = device->dev; + + if (dev->driver && dev->driver->pm) { + int ret; + + ret = pm_runtime_resume_and_get(dev); + if (ret) { + dev_info_ratelimited(dev, + "vfio: runtime resume failed %d\n", ret); + return -EIO; + } + } + + return 0; +} + +/* + * Wrapper around pm_runtime_put(). + */ +static inline void vfio_device_pm_runtime_put(struct vfio_device *device) +{ + struct device *dev = device->dev; + + if (dev->driver && dev->driver->pm) + pm_runtime_put(dev); +} + /* * VFIO Device fd */ @@ -1673,15 +1707,27 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { struct vfio_device *device = filep->private_data; + int ret; + + ret = vfio_device_pm_runtime_get(device); + if (ret) + return ret; switch (cmd) { case VFIO_DEVICE_FEATURE: - return vfio_ioctl_device_feature(device, (void __user *)arg); + ret = vfio_ioctl_device_feature(device, (void __user *)arg); + break; + default: if (unlikely(!device->ops->ioctl)) - return -EINVAL; - return device->ops->ioctl(device, cmd, arg); + ret = -EINVAL; + else + ret = device->ops->ioctl(device, cmd, arg); + break; } + + vfio_device_pm_runtime_put(device); + return ret; } static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, From 4813724c4b76b62f8e6b60dd5655633d0db1c9a8 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:48 +0530 Subject: [PATCH 14/77] vfio/pci: Mask INTx during runtime suspend This patch adds INTx handling during runtime suspend/resume. All the suspend/resume related code for the user to put the device into the low power state will be added in subsequent patches. The INTx lines may be shared among devices. Whenever any INTx interrupt comes for the VFIO devices, then vfio_intx_handler() will be called for each device sharing the interrupt. Inside vfio_intx_handler(), it calls pci_check_and_mask_intx() and checks if the interrupt has been generated for the current device. Now, if the device is already in the D3cold state, then the config space can not be read. Attempt to read config space in D3cold state can cause system unresponsiveness in a few systems. To prevent this, mask INTx in runtime suspend callback, and unmask the same in runtime resume callback. If INTx has been already masked, then no handling is needed in runtime suspend/resume callbacks. 'pm_intx_masked' tracks this, and vfio_pci_intx_mask() has been updated to return true if the INTx vfio_pci_irq_ctx.masked value is changed inside this function. For the runtime suspend which is triggered for the no user of VFIO device, the 'irq_type' will be VFIO_PCI_NUM_IRQS and these callbacks won't do anything. The MSI/MSI-X are not shared so similar handling should not be needed for MSI/MSI-X. vfio_msihandler() triggers eventfd_signal() without doing any device-specific config access. When the user performs any config access or IOCTL after receiving the eventfd notification, then the device will be moved to the D0 state first before servicing any request. Another option was to check this flag 'pm_intx_masked' inside vfio_intx_handler() instead of masking the interrupts. This flag is being set inside the runtime_suspend callback but the device can be in non-D3cold state (for example, if the user has disabled D3cold explicitly by sysfs, the D3cold is not supported in the platform, etc.). Also, in D3cold supported case, the device will be in D0 till the PCI core moves the device into D3cold. In this case, there is a possibility that the device can generate an interrupt. Adding check in the IRQ handler will not clear the IRQ status and the interrupt line will still be asserted. This can cause interrupt flooding. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-4-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 38 +++++++++++++++++++++++++++---- drivers/vfio/pci/vfio_pci_intrs.c | 6 ++++- drivers/vfio/pci/vfio_pci_priv.h | 2 +- include/linux/vfio_pci_core.h | 1 + 4 files changed, 41 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9273f1ffd0dd..207ede189c2a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -277,16 +277,46 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } +#ifdef CONFIG_PM +static int vfio_pci_core_runtime_suspend(struct device *dev) +{ + struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + + /* + * If INTx is enabled, then mask INTx before going into the runtime + * suspended state and unmask the same in the runtime resume. + * If INTx has already been masked by the user, then + * vfio_pci_intx_mask() will return false and in that case, INTx + * should not be unmasked in the runtime resume. + */ + vdev->pm_intx_masked = ((vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) && + vfio_pci_intx_mask(vdev)); + + return 0; +} + +static int vfio_pci_core_runtime_resume(struct device *dev) +{ + struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + + if (vdev->pm_intx_masked) + vfio_pci_intx_unmask(vdev); + + return 0; +} +#endif /* CONFIG_PM */ + /* - * The dev_pm_ops needs to be provided to make pci-driver runtime PM working, - * so use structure without any callbacks. - * * The pci-driver core runtime PM routines always save the device state * before going into suspended state. If the device is going into low power * state with only with runtime PM ops, then no explicit handling is needed * for the devices which have NoSoftRst-. */ -static const struct dev_pm_ops vfio_pci_core_pm_ops = { }; +static const struct dev_pm_ops vfio_pci_core_pm_ops = { + SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend, + vfio_pci_core_runtime_resume, + NULL) +}; int vfio_pci_core_enable(struct vfio_pci_core_device *vdev) { diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 8cb987ef3c47..40c3d7cf163f 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -59,10 +59,12 @@ static void vfio_send_intx_eventfd(void *opaque, void *unused) eventfd_signal(vdev->ctx[0].trigger, 1); } -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) +/* Returns true if the INTx vfio_pci_irq_ctx.masked value is changed. */ +bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; unsigned long flags; + bool masked_changed = false; spin_lock_irqsave(&vdev->irqlock, flags); @@ -86,9 +88,11 @@ void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev) disable_irq_nosync(pdev->irq); vdev->ctx[0].masked = true; + masked_changed = true; } spin_unlock_irqrestore(&vdev->irqlock, flags); + return masked_changed; } /* diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 58b8d34c162c..5e4fa69aee16 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -23,7 +23,7 @@ struct vfio_pci_ioeventfd { bool test_mem; }; -void vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); +bool vfio_pci_intx_mask(struct vfio_pci_core_device *vdev); void vfio_pci_intx_unmask(struct vfio_pci_core_device *vdev); int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index e5cf0d3313a6..a0f1f36e42a2 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -78,6 +78,7 @@ struct vfio_pci_core_device { bool needs_reset; bool nointx; bool needs_pm_restore; + bool pm_intx_masked; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; From cc2742fe3660cc6500021d3da8f937d326392dbd Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:49 +0530 Subject: [PATCH 15/77] vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT Currently, if the runtime power management is enabled for vfio-pci based devices in the guest OS, then the guest OS will do the register write for PCI_PM_CTRL register. This write request will be handled in vfio_pm_config_write() where it will do the actual register write of PCI_PM_CTRL register. With this, the maximum D3hot state can be achieved for low power. If we can use the runtime PM framework, then we can achieve the D3cold state (on the supported systems) which will help in saving maximum power. 1. D3cold state can't be achieved by writing PCI standard PM config registers. This patch implements the following newly added low power related device features: - VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY - VFIO_DEVICE_FEATURE_LOW_POWER_EXIT The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the device to make use of low power platform states on the host while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent further use of those power states. 2. The vfio-pci driver uses runtime PM framework for low power entry and exit. On the platforms where D3cold state is supported, the runtime PM framework will put the device into D3cold otherwise, D3hot or some other power state will be used. There are various cases where the device will not go into the runtime suspended state. For example, - The runtime power management is disabled on the host side for the device. - The user keeps the device busy after calling LOW_POWER_ENTRY. - There are dependent devices that are still in runtime active state. For these cases, the device will be in the same power state that has been configured by the user through PCI_PM_CTRL register. 3. The hypervisors can implement virtual ACPI methods. For example, in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power resources with _ON/_OFF method, then guest linux OS invokes the _OFF method during D3cold transition and then _ON during D0 transition. The hypervisor can tap these virtual ACPI calls and then call the low power device feature IOCTL. 4. The 'pm_runtime_engaged' flag tracks the entry and exit to runtime PM. This flag is protected with 'memory_lock' semaphore. 5. All the config and other region access are wrapped under pm_runtime_resume_and_get() and pm_runtime_put(). So, if any device access happens while the device is in the runtime suspended state, then the device will be resumed first before access. Once the access has been finished, then the device will again go into the runtime suspended state. 6. The memory region access through mmap will not be allowed in the low power state. Since __vfio_pci_memory_enabled() is a common function, so check for 'pm_runtime_engaged' has been added explicitly in vfio_pci_mmap_fault() to block only mmap'ed access. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 153 +++++++++++++++++++++++++++++-- include/linux/vfio_pci_core.h | 1 + 2 files changed, 146 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 207ede189c2a..9c612162653f 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -277,11 +277,100 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } +static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev) +{ + /* + * The vdev power related flags are protected with 'memory_lock' + * semaphore. + */ + vfio_pci_zap_and_down_write_memory_lock(vdev); + if (vdev->pm_runtime_engaged) { + up_write(&vdev->memory_lock); + return -EINVAL; + } + + vdev->pm_runtime_engaged = true; + pm_runtime_put_noidle(&vdev->pdev->dev); + up_write(&vdev->memory_lock); + + return 0; +} + +static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); + if (ret != 1) + return ret; + + /* + * Inside vfio_pci_runtime_pm_entry(), only the runtime PM usage count + * will be decremented. The pm_runtime_put() will be invoked again + * while returning from the ioctl and then the device can go into + * runtime suspended state. + */ + return vfio_pci_runtime_pm_entry(vdev); +} + +static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) +{ + if (vdev->pm_runtime_engaged) { + vdev->pm_runtime_engaged = false; + pm_runtime_get_noresume(&vdev->pdev->dev); + } +} + +static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) +{ + /* + * The vdev power related flags are protected with 'memory_lock' + * semaphore. + */ + down_write(&vdev->memory_lock); + __vfio_pci_runtime_pm_exit(vdev); + up_write(&vdev->memory_lock); +} + +static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, + void __user *arg, size_t argsz) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); + if (ret != 1) + return ret; + + /* + * The device is always in the active state here due to pm wrappers + * around ioctls. + */ + vfio_pci_runtime_pm_exit(vdev); + return 0; +} + #ifdef CONFIG_PM static int vfio_pci_core_runtime_suspend(struct device *dev) { struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + down_write(&vdev->memory_lock); + /* + * The user can move the device into D3hot state before invoking + * power management IOCTL. Move the device into D0 state here and then + * the pci-driver core runtime PM suspend function will move the device + * into the low power state. Also, for the devices which have + * NoSoftRst-, it will help in restoring the original state + * (saved locally in 'vdev->pm_save'). + */ + vfio_pci_set_power_state(vdev, PCI_D0); + up_write(&vdev->memory_lock); + /* * If INTx is enabled, then mask INTx before going into the runtime * suspended state and unmask the same in the runtime resume. @@ -418,6 +507,18 @@ void vfio_pci_core_disable(struct vfio_pci_core_device *vdev) /* * This function can be invoked while the power state is non-D0. + * This non-D0 power state can be with or without runtime PM. + * vfio_pci_runtime_pm_exit() will internally increment the usage + * count corresponding to pm_runtime_put() called during low power + * feature entry and then pm_runtime_resume() will wake up the device, + * if the device has already gone into the suspended state. Otherwise, + * the vfio_pci_set_power_state() will change the device power state + * to D0. + */ + vfio_pci_runtime_pm_exit(vdev); + pm_runtime_resume(&pdev->dev); + + /* * This function calls __pci_reset_function_locked() which internally * can use pci_pm_reset() for the function reset. pci_pm_reset() will * fail if the power state is non-D0. Also, for the devices which @@ -1273,6 +1374,10 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) { switch (flags & VFIO_DEVICE_FEATURE_MASK) { + case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: + return vfio_pci_core_pm_entry(device, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: + return vfio_pci_core_pm_exit(device, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: return vfio_pci_core_feature_token(device, flags, arg, argsz); default: @@ -1285,31 +1390,47 @@ static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + int ret; if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; + ret = pm_runtime_resume_and_get(&vdev->pdev->dev); + if (ret) { + pci_info_ratelimited(vdev->pdev, "runtime resume failed %d\n", + ret); + return -EIO; + } + switch (index) { case VFIO_PCI_CONFIG_REGION_INDEX: - return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + ret = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + break; case VFIO_PCI_ROM_REGION_INDEX: if (iswrite) - return -EINVAL; - return vfio_pci_bar_rw(vdev, buf, count, ppos, false); + ret = -EINVAL; + else + ret = vfio_pci_bar_rw(vdev, buf, count, ppos, false); + break; case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: - return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + ret = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + break; case VFIO_PCI_VGA_REGION_INDEX: - return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + ret = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite); + break; + default: index -= VFIO_PCI_NUM_REGIONS; - return vdev->region[index].ops->rw(vdev, buf, + ret = vdev->region[index].ops->rw(vdev, buf, count, ppos, iswrite); + break; } - return -EINVAL; + pm_runtime_put(&vdev->pdev->dev); + return ret; } ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf, @@ -1504,7 +1625,11 @@ static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf) mutex_lock(&vdev->vma_lock); down_read(&vdev->memory_lock); - if (!__vfio_pci_memory_enabled(vdev)) { + /* + * Memory region cannot be accessed if the low power feature is engaged + * or memory access is disabled. + */ + if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) { ret = VM_FAULT_SIGBUS; goto up_out; } @@ -2219,6 +2344,15 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, goto err_unlock; } + /* + * Some of the devices in the dev_set can be in the runtime suspended + * state. Increment the usage count for all the devices in the dev_set + * before reset and decrement the same after reset. + */ + ret = vfio_pci_dev_set_pm_runtime_get(dev_set); + if (ret) + goto err_unlock; + list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { /* * Test whether all the affected devices are contained by the @@ -2274,6 +2408,9 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, else mutex_unlock(&cur->vma_lock); } + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) + pm_runtime_put(&cur->pdev->dev); err_unlock: mutex_unlock(&dev_set->lock); return ret; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index a0f1f36e42a2..1025d53fde0b 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -79,6 +79,7 @@ struct vfio_pci_core_device { bool nointx; bool needs_pm_restore; bool pm_intx_masked; + bool pm_runtime_engaged; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; From 453e6c98fd2bdae0df9adbc86af8d8bf1164edd5 Mon Sep 17 00:00:00 2001 From: Abhishek Sahu Date: Mon, 29 Aug 2022 17:18:50 +0530 Subject: [PATCH 16/77] vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP This patch implements VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP device feature. In the VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY, if there is any access for the VFIO device on the host side, then the device will be moved out of the low power state without the user's guest driver involvement. Once the device access has been finished, then the host can move the device again into low power state. With the low power entry happened through VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP, the device will not be moved back into the low power state and a notification will be sent to the user by triggering wakeup eventfd. vfio_pci_core_pm_entry() will be called for both the variants of low power feature entry so add an extra argument for wakeup eventfd context and store locally in 'struct vfio_pci_core_device'. For the entry happened without wakeup eventfd, all the exit related handling will be done by the LOW_POWER_EXIT device feature only. When the LOW_POWER_EXIT will be called, then the vfio core layer vfio_device_pm_runtime_get() will increment the usage count and will resume the device. In the driver runtime_resume callback, the 'pm_wake_eventfd_ctx' will be NULL. Then vfio_pci_core_pm_exit() will call vfio_pci_runtime_pm_exit() and all the exit related handling will be done. For the entry happened with wakeup eventfd, in the driver resume callback, eventfd will be triggered and all the exit related handling will be done. When vfio_pci_runtime_pm_exit() will be called by vfio_pci_core_pm_exit(), then it will return early. But if the runtime suspend has not happened on the host side, then all the exit related handling will be done in vfio_pci_core_pm_exit() only. Signed-off-by: Abhishek Sahu Link: https://lore.kernel.org/r/20220829114850.4341-6-abhsahu@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 63 ++++++++++++++++++++++++++++++-- include/linux/vfio_pci_core.h | 1 + 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 9c612162653f..0d4b49f06b14 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -277,7 +277,8 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat return ret; } -static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev) +static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev, + struct eventfd_ctx *efdctx) { /* * The vdev power related flags are protected with 'memory_lock' @@ -290,6 +291,7 @@ static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev) } vdev->pm_runtime_engaged = true; + vdev->pm_wake_eventfd_ctx = efdctx; pm_runtime_put_noidle(&vdev->pdev->dev); up_write(&vdev->memory_lock); @@ -313,7 +315,40 @@ static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags, * while returning from the ioctl and then the device can go into * runtime suspended state. */ - return vfio_pci_runtime_pm_entry(vdev); + return vfio_pci_runtime_pm_entry(vdev, NULL); +} + +static int vfio_pci_core_pm_entry_with_wakeup( + struct vfio_device *device, u32 flags, + struct vfio_device_low_power_entry_with_wakeup __user *arg, + size_t argsz) +{ + struct vfio_pci_core_device *vdev = + container_of(device, struct vfio_pci_core_device, vdev); + struct vfio_device_low_power_entry_with_wakeup entry; + struct eventfd_ctx *efdctx; + int ret; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, + sizeof(entry)); + if (ret != 1) + return ret; + + if (copy_from_user(&entry, arg, sizeof(entry))) + return -EFAULT; + + if (entry.wakeup_eventfd < 0) + return -EINVAL; + + efdctx = eventfd_ctx_fdget(entry.wakeup_eventfd); + if (IS_ERR(efdctx)) + return PTR_ERR(efdctx); + + ret = vfio_pci_runtime_pm_entry(vdev, efdctx); + if (ret) + eventfd_ctx_put(efdctx); + + return ret; } static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) @@ -321,6 +356,11 @@ static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev) if (vdev->pm_runtime_engaged) { vdev->pm_runtime_engaged = false; pm_runtime_get_noresume(&vdev->pdev->dev); + + if (vdev->pm_wake_eventfd_ctx) { + eventfd_ctx_put(vdev->pm_wake_eventfd_ctx); + vdev->pm_wake_eventfd_ctx = NULL; + } } } @@ -348,7 +388,10 @@ static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags, /* * The device is always in the active state here due to pm wrappers - * around ioctls. + * around ioctls. If the device had entered a low power state and + * pm_wake_eventfd_ctx is valid, vfio_pci_core_runtime_resume() has + * already signaled the eventfd and exited low power mode itself. + * pm_runtime_engaged protects the redundant call here. */ vfio_pci_runtime_pm_exit(vdev); return 0; @@ -388,6 +431,17 @@ static int vfio_pci_core_runtime_resume(struct device *dev) { struct vfio_pci_core_device *vdev = dev_get_drvdata(dev); + /* + * Resume with a pm_wake_eventfd_ctx signals the eventfd and exit + * low power mode. + */ + down_write(&vdev->memory_lock); + if (vdev->pm_wake_eventfd_ctx) { + eventfd_signal(vdev->pm_wake_eventfd_ctx, 1); + __vfio_pci_runtime_pm_exit(vdev); + } + up_write(&vdev->memory_lock); + if (vdev->pm_intx_masked) vfio_pci_intx_unmask(vdev); @@ -1376,6 +1430,9 @@ int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags, switch (flags & VFIO_DEVICE_FEATURE_MASK) { case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY: return vfio_pci_core_pm_entry(device, flags, arg, argsz); + case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP: + return vfio_pci_core_pm_entry_with_wakeup(device, flags, + arg, argsz); case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT: return vfio_pci_core_pm_exit(device, flags, arg, argsz); case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN: diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 1025d53fde0b..089b603bcfdc 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -85,6 +85,7 @@ struct vfio_pci_core_device { int ioeventfds_nr; struct eventfd_ctx *err_trigger; struct eventfd_ctx *req_trigger; + struct eventfd_ctx *pm_wake_eventfd_ctx; struct list_head dummy_resources_list; struct mutex ioeventfds_lock; struct list_head ioeventfds_list; From 21c13829bc3b786bc5336470df023ae54e41d230 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 16 Aug 2022 16:13:04 -0300 Subject: [PATCH 17/77] vfio: Remove vfio_group dev_counter This counts the number of devices attached to a vfio_group, ie the number of items in the group->device_list. It is only read in vfio_pin_pages(), as some kind of protection against limitations in type1. However, with all the code cleanups in this area, now that vfio_pin_pages() accepts a vfio_device directly it is redundant. All drivers are already calling vfio_register_emulated_iommu_dev() which directly creates a group specifically for the device and thus it is guaranteed that there is a singleton group. Leave a note in the comment about this requirement and remove the logic. Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/0-v2-d4374a7bf0c9+c4-vfio_dev_counter_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 5edc49748013..77264d836d52 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -75,7 +75,6 @@ struct vfio_group { struct list_head vfio_next; struct list_head container_next; enum vfio_group_type type; - unsigned int dev_counter; struct rw_semaphore group_rwsem; struct kvm *kvm; struct file *opened_file; @@ -609,7 +608,6 @@ static int __vfio_register_dev(struct vfio_device *device, mutex_lock(&group->device_lock); list_add(&device->group_next, &group->device_list); - group->dev_counter++; mutex_unlock(&group->device_lock); return 0; @@ -697,7 +695,6 @@ void vfio_unregister_group_dev(struct vfio_device *device) mutex_lock(&group->device_lock); list_del(&device->group_next); - group->dev_counter--; mutex_unlock(&group->device_lock); if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) @@ -1991,6 +1988,9 @@ EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); * @prot [in] : protection flags * @pages[out] : array of host pages * Return error or number of pages pinned. + * + * A driver may only call this function if the vfio_device was created + * by vfio_register_emulated_iommu_dev(). */ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, int npage, int prot, struct page **pages) @@ -2006,9 +2006,6 @@ int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - if (group->dev_counter > 1) - return -EINVAL; - /* group->container cannot change while a vfio device is open */ container = group->container; driver = container->iommu_driver; From 245898eb9275ce31942cff95d0bdc7412ad3d589 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 31 Aug 2022 09:59:43 +0100 Subject: [PATCH 18/77] hisi_acc_vfio_pci: Correct the function prefix for hssi_acc_drvdata() Commit 91be0bd6c6cf("vfio/pci: Have all VFIO PCI drivers store the vfio_pci_core_device in drvdata") introduced a helper function to retrieve the drvdata but used "hssi" instead of "hisi" for the function prefix. Correct that and also while at it, moved the function a bit down so that it's close to other hisi_ prefixed functions. No functional changes. Signed-off-by: Shameer Kolothum Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220831085943.993-1-shameerali.kolothum.thodi@huawei.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index ea762e28c1cc..258cae0863ea 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -337,14 +337,6 @@ static int vf_qm_cache_wb(struct hisi_qm *qm) return 0; } -static struct hisi_acc_vf_core_device *hssi_acc_drvdata(struct pci_dev *pdev) -{ - struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); - - return container_of(core_device, struct hisi_acc_vf_core_device, - core_device); -} - static void vf_qm_fun_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct hisi_qm *qm) { @@ -552,6 +544,14 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, return 0; } +static struct hisi_acc_vf_core_device *hisi_acc_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + + return container_of(core_device, struct hisi_acc_vf_core_device, + core_device); +} + /* Check the PF's RAS state and Function INT state */ static int hisi_acc_check_int_state(struct hisi_acc_vf_core_device *hisi_acc_vdev) @@ -970,7 +970,7 @@ hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, static void hisi_acc_vf_pci_aer_reset_done(struct pci_dev *pdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = hssi_acc_drvdata(pdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); if (hisi_acc_vdev->core_device.vdev.migration_flags != VFIO_MIGRATION_STOP_COPY) @@ -1301,7 +1301,7 @@ static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = hssi_acc_drvdata(pdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device); vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device); From eab60bbc05a9375145e7b793ca37a1b6ec262887 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 2 Sep 2022 18:07:54 +0200 Subject: [PATCH 19/77] vfio/fsl-mc: Fix a typo in a message L and S are swapped in the message. s/VFIO_FLS_MC/VFIO_FSL_MC/ Also use 'ret' instead of 'WARN_ON(ret)' to avoid a duplicated message. Signed-off-by: Christophe JAILLET Reviewed-by: Diana Craciun Acked-by: Cornelia Huck Link: https://lore.kernel.org/r/a7c1394346725b7435792628c8d4c06a0a745e0b.1662134821.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 3feff729f3ce..42b344bd7cd5 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -108,9 +108,9 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) /* reset the device before cleaning up the interrupts */ ret = vfio_fsl_mc_reset_device(vdev); - if (WARN_ON(ret)) + if (ret) dev_warn(&mc_cont->dev, - "VFIO_FLS_MC: reset device has failed (%d)\n", ret); + "VFIO_FSL_MC: reset device has failed (%d)\n", ret); vfio_fsl_mc_irqs_cleanup(vdev); From 42ee53f9bfd3e4cf58ae7656e0d11075f5fe8489 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:41 +0300 Subject: [PATCH 20/77] vfio: Introduce DMA logging uAPIs DMA logging allows a device to internally record what DMAs the device is initiating and report them back to userspace. It is part of the VFIO migration infrastructure that allows implementing dirty page tracking during the pre copy phase of live migration. Only DMA WRITEs are logged, and this API is not connected to VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. This patch introduces the DMA logging involved uAPIs. It uses the FEATURE ioctl with its GET/SET/PROBE options as of below. It exposes a PROBE option to detect if the device supports DMA logging. It exposes a SET option to start device DMA logging in given IOVAs ranges. It exposes a SET option to stop device DMA logging that was previously started. It exposes a GET option to read back and clear the device DMA log. Extra details exist as part of vfio.h per a specific option. Signed-off-by: Yishai Hadas Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220908183448.195262-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- include/uapi/linux/vfio.h | 86 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 76a173f973de..d7d8e0922376 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -1042,6 +1042,92 @@ struct vfio_device_low_power_entry_with_wakeup { */ #define VFIO_DEVICE_FEATURE_LOW_POWER_EXIT 5 +/* + * Upon VFIO_DEVICE_FEATURE_SET start/stop device DMA logging. + * VFIO_DEVICE_FEATURE_PROBE can be used to detect if the device supports + * DMA logging. + * + * DMA logging allows a device to internally record what DMAs the device is + * initiating and report them back to userspace. It is part of the VFIO + * migration infrastructure that allows implementing dirty page tracking + * during the pre copy phase of live migration. Only DMA WRITEs are logged, + * and this API is not connected to VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. + * + * When DMA logging is started a range of IOVAs to monitor is provided and the + * device can optimize its logging to cover only the IOVA range given. Each + * DMA that the device initiates inside the range will be logged by the device + * for later retrieval. + * + * page_size is an input that hints what tracking granularity the device + * should try to achieve. If the device cannot do the hinted page size then + * it's the driver choice which page size to pick based on its support. + * On output the device will return the page size it selected. + * + * ranges is a pointer to an array of + * struct vfio_device_feature_dma_logging_range. + * + * The core kernel code guarantees to support by minimum num_ranges that fit + * into a single kernel page. User space can try higher values but should give + * up if the above can't be achieved as of some driver limitations. + * + * A single call to start device DMA logging can be issued and a matching stop + * should follow at the end. Another start is not allowed in the meantime. + */ +struct vfio_device_feature_dma_logging_control { + __aligned_u64 page_size; + __u32 num_ranges; + __u32 __reserved; + __aligned_u64 ranges; +}; + +struct vfio_device_feature_dma_logging_range { + __aligned_u64 iova; + __aligned_u64 length; +}; + +#define VFIO_DEVICE_FEATURE_DMA_LOGGING_START 6 + +/* + * Upon VFIO_DEVICE_FEATURE_SET stop device DMA logging that was started + * by VFIO_DEVICE_FEATURE_DMA_LOGGING_START + */ +#define VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP 7 + +/* + * Upon VFIO_DEVICE_FEATURE_GET read back and clear the device DMA log + * + * Query the device's DMA log for written pages within the given IOVA range. + * During querying the log is cleared for the IOVA range. + * + * bitmap is a pointer to an array of u64s that will hold the output bitmap + * with 1 bit reporting a page_size unit of IOVA. The mapping of IOVA to bits + * is given by: + * bitmap[(addr - iova)/page_size] & (1ULL << (addr % 64)) + * + * The input page_size can be any power of two value and does not have to + * match the value given to VFIO_DEVICE_FEATURE_DMA_LOGGING_START. The driver + * will format its internal logging to match the reporting page size, possibly + * by replicating bits if the internal page size is lower than requested. + * + * The LOGGING_REPORT will only set bits in the bitmap and never clear or + * perform any initialization of the user provided bitmap. + * + * If any error is returned userspace should assume that the dirty log is + * corrupted. Error recovery is to consider all memory dirty and try to + * restart the dirty tracking, or to abort/restart the whole migration. + * + * If DMA logging is not enabled, an error will be returned. + * + */ +struct vfio_device_feature_dma_logging_report { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 bitmap; +}; + +#define VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT 8 + /* -------- API for Type1 VFIO IOMMU -------- */ /** From 58ccf0190d19d9a8a41f8a02b9e06742b58df4a1 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 8 Sep 2022 21:34:42 +0300 Subject: [PATCH 21/77] vfio: Add an IOVA bitmap support The new facility adds a bunch of wrappers that abstract how an IOVA range is represented in a bitmap that is granulated by a given page_size. So it translates all the lifting of dealing with user pointers into its corresponding kernel addresses backing said user memory into doing finally the (non-atomic) bitmap ops to change various bits. The formula for the bitmap is: data[(iova / page_size) / 64] & (1ULL << (iova % 64)) Where 64 is the number of bits in a unsigned long (depending on arch) It introduces an IOVA iterator that uses a windowing scheme to minimize the pinning overhead, as opposed to pinning it on demand 4K at a time. Assuming a 4K kernel page and 4K requested page size, we can use a single kernel page to hold 512 page pointers, mapping 2M of bitmap, representing 64G of IOVA space. An example usage of these helpers for a given @base_iova, @page_size, @length and __user @data: bitmap = iova_bitmap_alloc(base_iova, page_size, length, data); if (IS_ERR(bitmap)) return -ENOMEM; ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn); iova_bitmap_free(bitmap); Each iteration of the @dirty_reporter_fn is called with a unique @iova and @length argument, indicating the current range available through the iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty areas (@iova_length) within that provided range, as following: iova_bitmap_set(bitmap, iova, iova_length); The facility is intended to be used for user bitmaps representing dirtied IOVAs by IOMMU (via IOMMUFD) and PCI Devices (via vfio-pci). Signed-off-by: Joao Martins Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Makefile | 6 +- drivers/vfio/iova_bitmap.c | 422 ++++++++++++++++++++++++++++++++++++ include/linux/iova_bitmap.h | 26 +++ 3 files changed, 452 insertions(+), 2 deletions(-) create mode 100644 drivers/vfio/iova_bitmap.c create mode 100644 include/linux/iova_bitmap.h diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 1a32357592e3..d67c604d0407 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,9 +1,11 @@ # SPDX-License-Identifier: GPL-2.0 vfio_virqfd-y := virqfd.o -vfio-y += vfio_main.o - obj-$(CONFIG_VFIO) += vfio.o + +vfio-y += vfio_main.o \ + iova_bitmap.o \ + obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o diff --git a/drivers/vfio/iova_bitmap.c b/drivers/vfio/iova_bitmap.c new file mode 100644 index 000000000000..6631e8befe1b --- /dev/null +++ b/drivers/vfio/iova_bitmap.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022, Oracle and/or its affiliates. + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ +#include +#include +#include + +#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) + +/* + * struct iova_bitmap_map - A bitmap representing an IOVA range + * + * Main data structure for tracking mapped user pages of bitmap data. + * + * For example, for something recording dirty IOVAs, it will be provided a + * struct iova_bitmap structure, as a general structure for iterating the + * total IOVA range. The struct iova_bitmap_map, though, represents the + * subset of said IOVA space that is pinned by its parent structure (struct + * iova_bitmap). + * + * The user does not need to exact location of the bits in the bitmap. + * From user perspective the only API available is iova_bitmap_set() which + * records the IOVA *range* in the bitmap by setting the corresponding + * bits. + * + * The bitmap is an array of u64 whereas each bit represents an IOVA of + * range of (1 << pgshift). Thus formula for the bitmap data to be set is: + * + * data[(iova / page_size) / 64] & (1ULL << (iova % 64)) + */ +struct iova_bitmap_map { + /* base IOVA representing bit 0 of the first page */ + unsigned long iova; + + /* page size order that each bit granules to */ + unsigned long pgshift; + + /* page offset of the first user page pinned */ + unsigned long pgoff; + + /* number of pages pinned */ + unsigned long npages; + + /* pinned pages representing the bitmap data */ + struct page **pages; +}; + +/* + * struct iova_bitmap - The IOVA bitmap object + * + * Main data structure for iterating over the bitmap data. + * + * Abstracts the pinning work and iterates in IOVA ranges. + * It uses a windowing scheme and pins the bitmap in relatively + * big ranges e.g. + * + * The bitmap object uses one base page to store all the pinned pages + * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores + * 512 struct page pointers which, if the base page size is 4K, it means + * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is + * also 4K then the range window to iterate is 64G. + * + * For example iterating on a total IOVA range of 4G..128G, it will walk + * through this set of ranges: + * + * 4G - 68G-1 (64G) + * 68G - 128G-1 (64G) + * + * An example of the APIs on how to use/iterate over the IOVA bitmap: + * + * bitmap = iova_bitmap_alloc(iova, length, page_size, data); + * if (IS_ERR(bitmap)) + * return PTR_ERR(bitmap); + * + * ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn); + * + * iova_bitmap_free(bitmap); + * + * Each iteration of the @dirty_reporter_fn is called with a unique @iova + * and @length argument, indicating the current range available through the + * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty + * areas (@iova_length) within that provided range, as following: + * + * iova_bitmap_set(bitmap, iova, iova_length); + * + * The internals of the object uses an index @mapped_base_index that indexes + * which u64 word of the bitmap is mapped, up to @mapped_total_index. + * Those keep being incremented until @mapped_total_index is reached while + * mapping up to PAGE_SIZE / sizeof(struct page*) maximum of pages. + * + * The IOVA bitmap is usually located on what tracks DMA mapped ranges or + * some form of IOVA range tracking that co-relates to the user passed + * bitmap. + */ +struct iova_bitmap { + /* IOVA range representing the currently mapped bitmap data */ + struct iova_bitmap_map mapped; + + /* userspace address of the bitmap */ + u64 __user *bitmap; + + /* u64 index that @mapped points to */ + unsigned long mapped_base_index; + + /* how many u64 can we walk in total */ + unsigned long mapped_total_index; + + /* base IOVA of the whole bitmap */ + unsigned long iova; + + /* length of the IOVA range for the whole bitmap */ + size_t length; +}; + +/* + * Converts a relative IOVA to a bitmap index. + * This function provides the index into the u64 array (bitmap::bitmap) + * for a given IOVA offset. + * Relative IOVA means relative to the bitmap::mapped base IOVA + * (stored in mapped::iova). All computations in this file are done using + * relative IOVAs and thus avoid an extra subtraction against mapped::iova. + * The user API iova_bitmap_set() always uses a regular absolute IOVAs. + */ +static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap, + unsigned long iova) +{ + unsigned long pgsize = 1 << bitmap->mapped.pgshift; + + return iova / (BITS_PER_TYPE(*bitmap->bitmap) * pgsize); +} + +/* + * Converts a bitmap index to a *relative* IOVA. + */ +static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap, + unsigned long index) +{ + unsigned long pgshift = bitmap->mapped.pgshift; + + return (index * BITS_PER_TYPE(*bitmap->bitmap)) << pgshift; +} + +/* + * Returns the base IOVA of the mapped range. + */ +static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap) +{ + unsigned long skip = bitmap->mapped_base_index; + + return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip); +} + +/* + * Pins the bitmap user pages for the current range window. + * This is internal to IOVA bitmap and called when advancing the + * index (@mapped_base_index) or allocating the bitmap. + */ +static int iova_bitmap_get(struct iova_bitmap *bitmap) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + unsigned long npages; + u64 __user *addr; + long ret; + + /* + * @mapped_base_index is the index of the currently mapped u64 words + * that we have access. Anything before @mapped_base_index is not + * mapped. The range @mapped_base_index .. @mapped_total_index-1 is + * mapped but capped at a maximum number of pages. + */ + npages = DIV_ROUND_UP((bitmap->mapped_total_index - + bitmap->mapped_base_index) * + sizeof(*bitmap->bitmap), PAGE_SIZE); + + /* + * We always cap at max number of 'struct page' a base page can fit. + * This is, for example, on x86 means 2M of bitmap data max. + */ + npages = min(npages, PAGE_SIZE / sizeof(struct page *)); + + /* + * Bitmap address to be pinned is calculated via pointer arithmetic + * with bitmap u64 word index. + */ + addr = bitmap->bitmap + bitmap->mapped_base_index; + + ret = pin_user_pages_fast((unsigned long)addr, npages, + FOLL_WRITE, mapped->pages); + if (ret <= 0) + return -EFAULT; + + mapped->npages = (unsigned long)ret; + /* Base IOVA where @pages point to i.e. bit 0 of the first page */ + mapped->iova = iova_bitmap_mapped_iova(bitmap); + + /* + * offset of the page where pinned pages bit 0 is located. + * This handles the case where the bitmap is not PAGE_SIZE + * aligned. + */ + mapped->pgoff = offset_in_page(addr); + return 0; +} + +/* + * Unpins the bitmap user pages and clears @npages + * (un)pinning is abstracted from API user and it's done when advancing + * the index or freeing the bitmap. + */ +static void iova_bitmap_put(struct iova_bitmap *bitmap) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + + if (mapped->npages) { + unpin_user_pages(mapped->pages, mapped->npages); + mapped->npages = 0; + } +} + +/** + * iova_bitmap_alloc() - Allocates an IOVA bitmap object + * @iova: Start address of the IOVA range + * @length: Length of the IOVA range + * @page_size: Page size of the IOVA bitmap. It defines what each bit + * granularity represents + * @data: Userspace address of the bitmap + * + * Allocates an IOVA object and initializes all its fields including the + * first user pages of @data. + * + * Return: A pointer to a newly allocated struct iova_bitmap + * or ERR_PTR() on error. + */ +struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, + unsigned long page_size, u64 __user *data) +{ + struct iova_bitmap_map *mapped; + struct iova_bitmap *bitmap; + int rc; + + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); + if (!bitmap) + return ERR_PTR(-ENOMEM); + + mapped = &bitmap->mapped; + mapped->pgshift = __ffs(page_size); + bitmap->bitmap = data; + bitmap->mapped_total_index = + iova_bitmap_offset_to_index(bitmap, length - 1) + 1; + bitmap->iova = iova; + bitmap->length = length; + mapped->iova = iova; + mapped->pages = (struct page **)__get_free_page(GFP_KERNEL); + if (!mapped->pages) { + rc = -ENOMEM; + goto err; + } + + rc = iova_bitmap_get(bitmap); + if (rc) + goto err; + return bitmap; + +err: + iova_bitmap_free(bitmap); + return ERR_PTR(rc); +} + +/** + * iova_bitmap_free() - Frees an IOVA bitmap object + * @bitmap: IOVA bitmap to free + * + * It unpins and releases pages array memory and clears any leftover + * state. + */ +void iova_bitmap_free(struct iova_bitmap *bitmap) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + + iova_bitmap_put(bitmap); + + if (mapped->pages) { + free_page((unsigned long)mapped->pages); + mapped->pages = NULL; + } + + kfree(bitmap); +} + +/* + * Returns the remaining bitmap indexes from mapped_total_index to process for + * the currently pinned bitmap pages. + */ +static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap) +{ + unsigned long remaining; + + remaining = bitmap->mapped_total_index - bitmap->mapped_base_index; + remaining = min_t(unsigned long, remaining, + (bitmap->mapped.npages << PAGE_SHIFT) / sizeof(*bitmap->bitmap)); + + return remaining; +} + +/* + * Returns the length of the mapped IOVA range. + */ +static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap) +{ + unsigned long max_iova = bitmap->iova + bitmap->length - 1; + unsigned long iova = iova_bitmap_mapped_iova(bitmap); + unsigned long remaining; + + /* + * iova_bitmap_mapped_remaining() returns a number of indexes which + * when converted to IOVA gives us a max length that the bitmap + * pinned data can cover. Afterwards, that is capped to + * only cover the IOVA range in @bitmap::iova .. @bitmap::length. + */ + remaining = iova_bitmap_index_to_offset(bitmap, + iova_bitmap_mapped_remaining(bitmap)); + + if (iova + remaining - 1 > max_iova) + remaining -= ((iova + remaining - 1) - max_iova); + + return remaining; +} + +/* + * Returns true if there's not more data to iterate. + */ +static bool iova_bitmap_done(struct iova_bitmap *bitmap) +{ + return bitmap->mapped_base_index >= bitmap->mapped_total_index; +} + +/* + * Advances to the next range, releases the current pinned + * pages and pins the next set of bitmap pages. + * Returns 0 on success or otherwise errno. + */ +static int iova_bitmap_advance(struct iova_bitmap *bitmap) +{ + unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1; + unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1; + + bitmap->mapped_base_index += count; + + iova_bitmap_put(bitmap); + if (iova_bitmap_done(bitmap)) + return 0; + + /* When advancing the index we pin the next set of bitmap pages */ + return iova_bitmap_get(bitmap); +} + +/** + * iova_bitmap_for_each() - Iterates over the bitmap + * @bitmap: IOVA bitmap to iterate + * @opaque: Additional argument to pass to the callback + * @fn: Function that gets called for each IOVA range + * + * Helper function to iterate over bitmap data representing a portion of IOVA + * space. It hides the complexity of iterating bitmaps and translating the + * mapped bitmap user pages into IOVA ranges to process. + * + * Return: 0 on success, and an error on failure either upon + * iteration or when the callback returns an error. + */ +int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn) +{ + int ret = 0; + + for (; !iova_bitmap_done(bitmap) && !ret; + ret = iova_bitmap_advance(bitmap)) { + ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap), + iova_bitmap_mapped_length(bitmap), opaque); + if (ret) + break; + } + + return ret; +} + +/** + * iova_bitmap_set() - Records an IOVA range in bitmap + * @bitmap: IOVA bitmap + * @iova: IOVA to start + * @length: IOVA range length + * + * Set the bits corresponding to the range [iova .. iova+length-1] in + * the user bitmap. + * + * Return: The number of bits set. + */ +void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length) +{ + struct iova_bitmap_map *mapped = &bitmap->mapped; + unsigned long offset = (iova - mapped->iova) >> mapped->pgshift; + unsigned long nbits = max_t(unsigned long, 1, length >> mapped->pgshift); + unsigned long page_idx = offset / BITS_PER_PAGE; + unsigned long page_offset = mapped->pgoff; + void *kaddr; + + offset = offset % BITS_PER_PAGE; + + do { + unsigned long size = min(BITS_PER_PAGE - offset, nbits); + + kaddr = kmap_local_page(mapped->pages[page_idx]); + bitmap_set(kaddr + page_offset, offset, size); + kunmap_local(kaddr); + page_offset = offset = 0; + nbits -= size; + page_idx++; + } while (nbits > 0); +} +EXPORT_SYMBOL_GPL(iova_bitmap_set); diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h new file mode 100644 index 000000000000..c006cf0a25f3 --- /dev/null +++ b/include/linux/iova_bitmap.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022, Oracle and/or its affiliates. + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ +#ifndef _IOVA_BITMAP_H_ +#define _IOVA_BITMAP_H_ + +#include + +struct iova_bitmap; + +typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap, + unsigned long iova, size_t length, + void *opaque); + +struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, + unsigned long page_size, + u64 __user *data); +void iova_bitmap_free(struct iova_bitmap *bitmap); +int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn); +void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length); + +#endif From 80c4b92a2dc48cce82a0348add48533db7e07314 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:43 +0300 Subject: [PATCH 22/77] vfio: Introduce the DMA logging feature support Introduce the DMA logging feature support in the vfio core layer. It includes the processing of the device start/stop/report DMA logging UAPIs and calling the relevant driver 'op' to do the work. Specifically, Upon start, the core translates the given input ranges into an interval tree, checks for unexpected overlapping, non aligned ranges and then pass the translated input to the driver for start tracking the given ranges. Upon report, the core translates the given input user space bitmap and page size into an IOVA kernel bitmap iterator. Then it iterates it and call the driver to set the corresponding bits for the dirtied pages in a specific IOVA range. Upon stop, the driver is called to stop the previous started tracking. The next patches from the series will introduce the mlx5 driver implementation for the logging ops. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Kconfig | 1 + drivers/vfio/pci/vfio_pci_core.c | 5 + drivers/vfio/vfio_main.c | 175 +++++++++++++++++++++++++++++++ include/linux/vfio.h | 28 ++++- 4 files changed, 207 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 6130d00252ed..86c381ceb9a1 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -3,6 +3,7 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" select IOMMU_API select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + select INTERVAL_TREE help VFIO provides a framework for secure userspace device drivers. See Documentation/driver-api/vfio.rst for more details. diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 0d4b49f06b14..0a801aee2f2d 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2128,6 +2128,11 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) return -EINVAL; } + if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start && + vdev->vdev.log_ops->log_stop && + vdev->vdev.log_ops->log_read_and_clear)) + return -EINVAL; + /* * Prevent binding to PFs with VFs enabled, the VFs might be in use * by the host or other users. We cannot capture the VFs if they diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 77264d836d52..27d9186f35d5 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include "vfio.h" #define DRIVER_VERSION "0.3" @@ -1658,6 +1660,167 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device, return 0; } +/* Ranges should fit into a single kernel page */ +#define LOG_MAX_RANGES \ + (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range)) + +static int +vfio_ioctl_device_feature_logging_start(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + size_t minsz = + offsetofend(struct vfio_device_feature_dma_logging_control, + ranges); + struct vfio_device_feature_dma_logging_range __user *ranges; + struct vfio_device_feature_dma_logging_control control; + struct vfio_device_feature_dma_logging_range range; + struct rb_root_cached root = RB_ROOT_CACHED; + struct interval_tree_node *nodes; + u64 iova_end; + u32 nnodes; + int i, ret; + + if (!device->log_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, + VFIO_DEVICE_FEATURE_SET, + sizeof(control)); + if (ret != 1) + return ret; + + if (copy_from_user(&control, arg, minsz)) + return -EFAULT; + + nnodes = control.num_ranges; + if (!nnodes) + return -EINVAL; + + if (nnodes > LOG_MAX_RANGES) + return -E2BIG; + + ranges = u64_to_user_ptr(control.ranges); + nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node), + GFP_KERNEL); + if (!nodes) + return -ENOMEM; + + for (i = 0; i < nnodes; i++) { + if (copy_from_user(&range, &ranges[i], sizeof(range))) { + ret = -EFAULT; + goto end; + } + if (!IS_ALIGNED(range.iova, control.page_size) || + !IS_ALIGNED(range.length, control.page_size)) { + ret = -EINVAL; + goto end; + } + + if (check_add_overflow(range.iova, range.length, &iova_end) || + iova_end > ULONG_MAX) { + ret = -EOVERFLOW; + goto end; + } + + nodes[i].start = range.iova; + nodes[i].last = range.iova + range.length - 1; + if (interval_tree_iter_first(&root, nodes[i].start, + nodes[i].last)) { + /* Range overlapping */ + ret = -EINVAL; + goto end; + } + interval_tree_insert(nodes + i, &root); + } + + ret = device->log_ops->log_start(device, &root, nnodes, + &control.page_size); + if (ret) + goto end; + + if (copy_to_user(arg, &control, sizeof(control))) { + ret = -EFAULT; + device->log_ops->log_stop(device); + } + +end: + kfree(nodes); + return ret; +} + +static int +vfio_ioctl_device_feature_logging_stop(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + int ret; + + if (!device->log_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, + VFIO_DEVICE_FEATURE_SET, 0); + if (ret != 1) + return ret; + + return device->log_ops->log_stop(device); +} + +static int vfio_device_log_read_and_clear(struct iova_bitmap *iter, + unsigned long iova, size_t length, + void *opaque) +{ + struct vfio_device *device = opaque; + + return device->log_ops->log_read_and_clear(device, iova, length, iter); +} + +static int +vfio_ioctl_device_feature_logging_report(struct vfio_device *device, + u32 flags, void __user *arg, + size_t argsz) +{ + size_t minsz = + offsetofend(struct vfio_device_feature_dma_logging_report, + bitmap); + struct vfio_device_feature_dma_logging_report report; + struct iova_bitmap *iter; + u64 iova_end; + int ret; + + if (!device->log_ops) + return -ENOTTY; + + ret = vfio_check_feature(flags, argsz, + VFIO_DEVICE_FEATURE_GET, + sizeof(report)); + if (ret != 1) + return ret; + + if (copy_from_user(&report, arg, minsz)) + return -EFAULT; + + if (report.page_size < SZ_4K || !is_power_of_2(report.page_size)) + return -EINVAL; + + if (check_add_overflow(report.iova, report.length, &iova_end) || + iova_end > ULONG_MAX) + return -EOVERFLOW; + + iter = iova_bitmap_alloc(report.iova, report.length, + report.page_size, + u64_to_user_ptr(report.bitmap)); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + ret = iova_bitmap_for_each(iter, device, + vfio_device_log_read_and_clear); + + iova_bitmap_free(iter); + return ret; +} + static int vfio_ioctl_device_feature(struct vfio_device *device, struct vfio_device_feature __user *arg) { @@ -1691,6 +1854,18 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, return vfio_ioctl_device_feature_mig_device_state( device, feature.flags, arg->data, feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_DMA_LOGGING_START: + return vfio_ioctl_device_feature_logging_start( + device, feature.flags, arg->data, + feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP: + return vfio_ioctl_device_feature_logging_stop( + device, feature.flags, arg->data, + feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT: + return vfio_ioctl_device_feature_logging_report( + device, feature.flags, arg->data, + feature.argsz - minsz); default: if (unlikely(!device->ops->device_feature)) return -EINVAL; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e05ddc6fe6a5..0e2826559091 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -14,6 +14,7 @@ #include #include #include +#include struct kvm; @@ -33,10 +34,11 @@ struct vfio_device { struct device *dev; const struct vfio_device_ops *ops; /* - * mig_ops is a static property of the vfio_device which must be set - * prior to registering the vfio_device. + * mig_ops/log_ops is a static property of the vfio_device which must + * be set prior to registering the vfio_device. */ const struct vfio_migration_ops *mig_ops; + const struct vfio_log_ops *log_ops; struct vfio_group *group; struct vfio_device_set *dev_set; struct list_head dev_set_list; @@ -108,6 +110,28 @@ struct vfio_migration_ops { enum vfio_device_mig_state *curr_state); }; +/** + * @log_start: Optional callback to ask the device start DMA logging. + * @log_stop: Optional callback to ask the device stop DMA logging. + * @log_read_and_clear: Optional callback to ask the device read + * and clear the dirty DMAs in some given range. + * + * The vfio core implementation of the DEVICE_FEATURE_DMA_LOGGING_ set + * of features does not track logging state relative to the device, + * therefore the device implementation of vfio_log_ops must handle + * arbitrary user requests. This includes rejecting subsequent calls + * to log_start without an intervening log_stop, as well as graceful + * handling of log_stop and log_read_and_clear from invalid states. + */ +struct vfio_log_ops { + int (*log_start)(struct vfio_device *device, + struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); + int (*log_stop)(struct vfio_device *device); + int (*log_read_and_clear)(struct vfio_device *device, + unsigned long iova, unsigned long length, + struct iova_bitmap *dirty); +}; + /** * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl * @flags: Arg from the device_feature op From 79c3cf279926f8db0a606a479944a131e27a39ea Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:44 +0300 Subject: [PATCH 23/77] vfio/mlx5: Init QP based resources for dirty tracking Init QP based resources for dirty tracking to be used upon start logging. It includes: Creating the host and firmware RC QPs, move each of them to its expected state based on the device specification, etc. Creating the relevant resources which are needed by both QPs as of UAR, PD, etc. Creating the host receive side resources as of MKEY, CQ, receive WQEs, etc. The above resources are cleaned-up upon stop logging. The tracker object that will be introduced by next patches will use those resources. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 595 +++++++++++++++++++++++++++++++++++- drivers/vfio/pci/mlx5/cmd.h | 53 ++++ 2 files changed, 636 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index dd5d7bfe0a49..0a362796d567 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -7,6 +7,8 @@ static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, u16 *vhca_id); +static void +_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod) { @@ -72,19 +74,22 @@ static int mlx5fv_vf_event(struct notifier_block *nb, struct mlx5vf_pci_core_device *mvdev = container_of(nb, struct mlx5vf_pci_core_device, nb); - mutex_lock(&mvdev->state_mutex); switch (event) { case MLX5_PF_NOTIFY_ENABLE_VF: + mutex_lock(&mvdev->state_mutex); mvdev->mdev_detach = false; + mlx5vf_state_mutex_unlock(mvdev); break; case MLX5_PF_NOTIFY_DISABLE_VF: - mlx5vf_disable_fds(mvdev); + mlx5vf_cmd_close_migratable(mvdev); + mutex_lock(&mvdev->state_mutex); mvdev->mdev_detach = true; + mlx5vf_state_mutex_unlock(mvdev); break; default: break; } - mlx5vf_state_mutex_unlock(mvdev); + return 0; } @@ -95,6 +100,7 @@ void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev) mutex_lock(&mvdev->state_mutex); mlx5vf_disable_fds(mvdev); + _mlx5vf_free_page_tracker_resources(mvdev); mlx5vf_state_mutex_unlock(mvdev); } @@ -188,11 +194,13 @@ static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, return ret; } -static int _create_state_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vf_migration_file *migf, u32 *mkey) +static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, + struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_recv_buf *recv_buf, + u32 *mkey) { - size_t npages = DIV_ROUND_UP(migf->total_length, PAGE_SIZE); - struct sg_dma_page_iter dma_iter; + size_t npages = migf ? DIV_ROUND_UP(migf->total_length, PAGE_SIZE) : + recv_buf->npages; int err = 0, inlen; __be64 *mtt; void *mkc; @@ -209,8 +217,17 @@ static int _create_state_mkey(struct mlx5_core_dev *mdev, u32 pdn, DIV_ROUND_UP(npages, 2)); mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) - *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); + if (migf) { + struct sg_dma_page_iter dma_iter; + + for_each_sgtable_dma_page(&migf->table.sgt, &dma_iter, 0) + *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); + } else { + int i; + + for (i = 0; i < npages; i++) + *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]); + } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); @@ -223,7 +240,8 @@ static int _create_state_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, qpn, 0xffffff); MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); - MLX5_SET64(mkc, mkc, len, migf->total_length); + MLX5_SET64(mkc, mkc, len, + migf ? migf->total_length : (npages * PAGE_SIZE)); err = mlx5_core_create_mkey(mdev, mkey, in, inlen); kvfree(in); return err; @@ -297,7 +315,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) goto err_dma_map; - err = _create_state_mkey(mdev, pdn, migf, &mkey); + err = _create_mkey(mdev, pdn, migf, NULL, &mkey); if (err) goto err_create_mkey; @@ -369,7 +387,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (err) goto err_reg; - err = _create_state_mkey(mdev, pdn, migf, &mkey); + err = _create_mkey(mdev, pdn, migf, NULL, &mkey); if (err) goto err_mkey; @@ -391,3 +409,556 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, mutex_unlock(&migf->lock); return err; } + +static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, + struct mlx5_vhca_cq_buf *buf, int nent, + int cqe_size) +{ + struct mlx5_frag_buf *frag_buf = &buf->frag_buf; + u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0); + u8 log_wq_sz = ilog2(cqe_size); + int err; + + err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf, + mdev->priv.numa_node); + if (err) + return err; + + mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc); + buf->cqe_size = cqe_size; + buf->nent = nent; + return 0; +} + +static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf) +{ + struct mlx5_cqe64 *cqe64; + void *cqe; + int i; + + for (i = 0; i < buf->nent; i++) { + cqe = mlx5_frag_buf_get_wqe(&buf->fbc, i); + cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64; + cqe64->op_own = MLX5_CQE_INVALID << 4; + } +} + +static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev, + struct mlx5_vhca_cq *cq) +{ + mlx5_core_destroy_cq(mdev, &cq->mcq); + mlx5_frag_buf_free(mdev, &cq->buf.frag_buf); + mlx5_db_free(mdev, &cq->db); +} + +static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, + struct mlx5_vhca_page_tracker *tracker, + size_t ncqe) +{ + int cqe_size = cache_line_size() == 128 ? 128 : 64; + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + struct mlx5_vhca_cq *cq; + int inlen, err, eqn; + void *cqc, *in; + __be64 *pas; + int vector; + + cq = &tracker->cq; + ncqe = roundup_pow_of_two(ncqe); + err = mlx5_db_alloc_node(mdev, &cq->db, mdev->priv.numa_node); + if (err) + return err; + + cq->ncqe = ncqe; + cq->mcq.set_ci_db = cq->db.db; + cq->mcq.arm_db = cq->db.db + 1; + cq->mcq.cqe_sz = cqe_size; + err = alloc_cq_frag_buf(mdev, &cq->buf, ncqe, cqe_size); + if (err) + goto err_db_free; + + init_cq_frag_buf(&cq->buf); + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * + cq->buf.frag_buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_buff; + } + + vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev); + err = mlx5_vector2eqn(mdev, vector, &eqn); + if (err) + goto err_vec; + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe)); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET(cqc, cqc, uar_page, tracker->uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); + pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); + mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas); + err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); + if (err) + goto err_vec; + + kvfree(in); + return 0; + +err_vec: + kvfree(in); +err_buff: + mlx5_frag_buf_free(mdev, &cq->buf.frag_buf); +err_db_free: + mlx5_db_free(mdev, &cq->db); + return err; +} + +static struct mlx5_vhca_qp * +mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev, + struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr) +{ + u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {}; + struct mlx5_vhca_qp *qp; + u8 log_rq_stride; + u8 log_rq_sz; + void *qpc; + int inlen; + void *in; + int err; + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); + log_rq_stride = ilog2(MLX5_SEND_WQE_DS); + log_rq_sz = ilog2(qp->rq.wqe_cnt); + err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node); + if (err) + goto err_free; + + if (max_recv_wr) { + err = mlx5_frag_buf_alloc_node(mdev, + wq_get_byte_sz(log_rq_sz, log_rq_stride), + &qp->buf, mdev->priv.numa_node); + if (err) + goto err_db_free; + mlx5_init_fbc(qp->buf.frags, log_rq_stride, log_rq_sz, &qp->rq.fbc); + } + + qp->rq.db = &qp->db.db[MLX5_RCV_DBR]; + inlen = MLX5_ST_SZ_BYTES(create_qp_in) + + MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) * + qp->buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_in; + } + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); + MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); + MLX5_SET(qpc, qpc, pd, tracker->pdn); + MLX5_SET(qpc, qpc, uar_page, tracker->uar->index); + MLX5_SET(qpc, qpc, log_page_size, + qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev)); + if (MLX5_CAP_GEN(mdev, cqe_version) == 1) + MLX5_SET(qpc, qpc, user_index, 0xFFFFFF); + MLX5_SET(qpc, qpc, no_sq, 1); + if (max_recv_wr) { + MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn); + MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4); + MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz); + MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ); + MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma); + mlx5_fill_page_frag_array(&qp->buf, + (__be64 *)MLX5_ADDR_OF(create_qp_in, + in, pas)); + } else { + MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ); + } + + MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP); + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + kvfree(in); + if (err) + goto err_in; + + qp->qpn = MLX5_GET(create_qp_out, out, qpn); + return qp; + +err_in: + if (max_recv_wr) + mlx5_frag_buf_free(mdev, &qp->buf); +err_db_free: + mlx5_db_free(mdev, &qp->db); +err_free: + kfree(qp); + return ERR_PTR(err); +} + +static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp) +{ + struct mlx5_wqe_data_seg *data; + unsigned int ix; + + WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt); + ix = qp->rq.pc & (qp->rq.wqe_cnt - 1); + data = mlx5_frag_buf_get_wqe(&qp->rq.fbc, ix); + data->byte_count = cpu_to_be32(qp->max_msg_size); + data->lkey = cpu_to_be32(qp->recv_buf.mkey); + data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset); + qp->rq.pc++; + /* Make sure that descriptors are written before doorbell record. */ + dma_wmb(); + *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff); +} + +static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev, + struct mlx5_vhca_qp *qp, u32 remote_qpn, + bool host_qp) +{ + u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {}; + u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {}; + u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {}; + void *qpc; + int ret; + + /* Init */ + qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); + MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED); + MLX5_SET(qpc, qpc, rre, 1); + MLX5_SET(qpc, qpc, rwe, 1); + MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP); + MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn); + ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in); + if (ret) + return ret; + + if (host_qp) { + struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; + int i; + + for (i = 0; i < qp->rq.wqe_cnt; i++) { + mlx5vf_post_recv(qp); + recv_buf->next_rq_offset += qp->max_msg_size; + } + } + + /* RTR */ + qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc); + MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); + MLX5_SET(qpc, qpc, mtu, IB_MTU_4096); + MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg)); + MLX5_SET(qpc, qpc, remote_qpn, remote_qpn); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1); + MLX5_SET(qpc, qpc, primary_address_path.fl, 1); + MLX5_SET(qpc, qpc, min_rnr_nak, 1); + MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP); + MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn); + ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in); + if (ret || host_qp) + return ret; + + /* RTS */ + qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc); + MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); + MLX5_SET(qpc, qpc, retry_count, 7); + MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */ + MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */ + MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP); + MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn); + + return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in); +} + +static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, + struct mlx5_vhca_qp *qp) +{ + u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {}; + + MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); + MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); + mlx5_cmd_exec_in(mdev, destroy_qp, in); + + mlx5_frag_buf_free(mdev, &qp->buf); + mlx5_db_free(mdev, &qp->db); + kfree(qp); +} + +static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) +{ + int i; + + /* Undo alloc_pages_bulk_array() */ + for (i = 0; i < recv_buf->npages; i++) + __free_page(recv_buf->page_list[i]); + + kvfree(recv_buf->page_list); +} + +static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, + unsigned int npages) +{ + unsigned int filled = 0, done = 0; + int i; + + recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), + GFP_KERNEL); + if (!recv_buf->page_list) + return -ENOMEM; + + for (;;) { + filled = alloc_pages_bulk_array(GFP_KERNEL, npages - done, + recv_buf->page_list + done); + if (!filled) + goto err; + + done += filled; + if (done == npages) + break; + } + + recv_buf->npages = npages; + return 0; + +err: + for (i = 0; i < npages; i++) { + if (recv_buf->page_list[i]) + __free_page(recv_buf->page_list[i]); + } + + kvfree(recv_buf->page_list); + return -ENOMEM; +} + +static int register_dma_recv_pages(struct mlx5_core_dev *mdev, + struct mlx5_vhca_recv_buf *recv_buf) +{ + int i, j; + + recv_buf->dma_addrs = kvcalloc(recv_buf->npages, + sizeof(*recv_buf->dma_addrs), + GFP_KERNEL); + if (!recv_buf->dma_addrs) + return -ENOMEM; + + for (i = 0; i < recv_buf->npages; i++) { + recv_buf->dma_addrs[i] = dma_map_page(mdev->device, + recv_buf->page_list[i], + 0, PAGE_SIZE, + DMA_FROM_DEVICE); + if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) + goto error; + } + return 0; + +error: + for (j = 0; j < i; j++) + dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], + PAGE_SIZE, DMA_FROM_DEVICE); + + kvfree(recv_buf->dma_addrs); + return -ENOMEM; +} + +static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, + struct mlx5_vhca_recv_buf *recv_buf) +{ + int i; + + for (i = 0; i < recv_buf->npages; i++) + dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], + PAGE_SIZE, DMA_FROM_DEVICE); + + kvfree(recv_buf->dma_addrs); +} + +static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, + struct mlx5_vhca_qp *qp) +{ + struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; + + mlx5_core_destroy_mkey(mdev, recv_buf->mkey); + unregister_dma_recv_pages(mdev, recv_buf); + free_recv_pages(&qp->recv_buf); +} + +static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, + struct mlx5_vhca_qp *qp, u32 pdn, + u64 rq_size) +{ + unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE); + struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; + int err; + + err = alloc_recv_pages(recv_buf, npages); + if (err < 0) + return err; + + err = register_dma_recv_pages(mdev, recv_buf); + if (err) + goto end; + + err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); + if (err) + goto err_create_mkey; + + return 0; + +err_create_mkey: + unregister_dma_recv_pages(mdev, recv_buf); +end: + free_recv_pages(recv_buf); + return err; +} + +static void +_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev) +{ + struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; + struct mlx5_core_dev *mdev = mvdev->mdev; + + lockdep_assert_held(&mvdev->state_mutex); + + if (!mvdev->log_active) + return; + + WARN_ON(mvdev->mdev_detach); + + mlx5vf_destroy_qp(mdev, tracker->fw_qp); + mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp); + mlx5vf_destroy_qp(mdev, tracker->host_qp); + mlx5vf_destroy_cq(mdev, &tracker->cq); + mlx5_core_dealloc_pd(mdev, tracker->pdn); + mlx5_put_uars_page(mdev, tracker->uar); + mvdev->log_active = false; +} + +int mlx5vf_stop_page_tracker(struct vfio_device *vdev) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + + mutex_lock(&mvdev->state_mutex); + if (!mvdev->log_active) + goto end; + + _mlx5vf_free_page_tracker_resources(mvdev); + mvdev->log_active = false; +end: + mlx5vf_state_mutex_unlock(mvdev); + return 0; +} + +int mlx5vf_start_page_tracker(struct vfio_device *vdev, + struct rb_root_cached *ranges, u32 nnodes, + u64 *page_size) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; + u8 log_tracked_page = ilog2(*page_size); + struct mlx5_vhca_qp *host_qp; + struct mlx5_vhca_qp *fw_qp; + struct mlx5_core_dev *mdev; + u32 max_msg_size = PAGE_SIZE; + u64 rq_size = SZ_2M; + u32 max_recv_wr; + int err; + + mutex_lock(&mvdev->state_mutex); + if (mvdev->mdev_detach) { + err = -ENOTCONN; + goto end; + } + + if (mvdev->log_active) { + err = -EINVAL; + goto end; + } + + mdev = mvdev->mdev; + memset(tracker, 0, sizeof(*tracker)); + tracker->uar = mlx5_get_uars_page(mdev); + if (IS_ERR(tracker->uar)) { + err = PTR_ERR(tracker->uar); + goto end; + } + + err = mlx5_core_alloc_pd(mdev, &tracker->pdn); + if (err) + goto err_uar; + + max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size); + err = mlx5vf_create_cq(mdev, tracker, max_recv_wr); + if (err) + goto err_dealloc_pd; + + host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr); + if (IS_ERR(host_qp)) { + err = PTR_ERR(host_qp); + goto err_cq; + } + + host_qp->max_msg_size = max_msg_size; + if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev, + pg_track_log_min_page_size)) { + log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, + pg_track_log_min_page_size); + } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev, + pg_track_log_max_page_size)) { + log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev, + pg_track_log_max_page_size); + } + + host_qp->tracked_page_size = (1ULL << log_tracked_page); + err = mlx5vf_alloc_qp_recv_resources(mdev, host_qp, tracker->pdn, + rq_size); + if (err) + goto err_host_qp; + + fw_qp = mlx5vf_create_rc_qp(mdev, tracker, 0); + if (IS_ERR(fw_qp)) { + err = PTR_ERR(fw_qp); + goto err_recv_resources; + } + + err = mlx5vf_activate_qp(mdev, host_qp, fw_qp->qpn, true); + if (err) + goto err_activate; + + err = mlx5vf_activate_qp(mdev, fw_qp, host_qp->qpn, false); + if (err) + goto err_activate; + + tracker->host_qp = host_qp; + tracker->fw_qp = fw_qp; + *page_size = host_qp->tracked_page_size; + mvdev->log_active = true; + mlx5vf_state_mutex_unlock(mvdev); + return 0; + +err_activate: + mlx5vf_destroy_qp(mdev, fw_qp); +err_recv_resources: + mlx5vf_free_qp_recv_resources(mdev, host_qp); +err_host_qp: + mlx5vf_destroy_qp(mdev, host_qp); +err_cq: + mlx5vf_destroy_cq(mdev, &tracker->cq); +err_dealloc_pd: + mlx5_core_dealloc_pd(mdev, tracker->pdn); +err_uar: + mlx5_put_uars_page(mdev, tracker->uar); +end: + mlx5vf_state_mutex_unlock(mvdev); + return err; +} diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8208f4701a90..e71ec017bf04 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -9,6 +9,8 @@ #include #include #include +#include +#include struct mlx5vf_async_data { struct mlx5_async_work cb_work; @@ -39,6 +41,52 @@ struct mlx5_vf_migration_file { struct mlx5vf_async_data async_data; }; +struct mlx5_vhca_cq_buf { + struct mlx5_frag_buf_ctrl fbc; + struct mlx5_frag_buf frag_buf; + int cqe_size; + int nent; +}; + +struct mlx5_vhca_cq { + struct mlx5_vhca_cq_buf buf; + struct mlx5_db db; + struct mlx5_core_cq mcq; + size_t ncqe; +}; + +struct mlx5_vhca_recv_buf { + u32 npages; + struct page **page_list; + dma_addr_t *dma_addrs; + u32 next_rq_offset; + u32 mkey; +}; + +struct mlx5_vhca_qp { + struct mlx5_frag_buf buf; + struct mlx5_db db; + struct mlx5_vhca_recv_buf recv_buf; + u32 tracked_page_size; + u32 max_msg_size; + u32 qpn; + struct { + unsigned int pc; + unsigned int cc; + unsigned int wqe_cnt; + __be32 *db; + struct mlx5_frag_buf_ctrl fbc; + } rq; +}; + +struct mlx5_vhca_page_tracker { + u32 pdn; + struct mlx5_uars_page *uar; + struct mlx5_vhca_cq cq; + struct mlx5_vhca_qp *host_qp; + struct mlx5_vhca_qp *fw_qp; +}; + struct mlx5vf_pci_core_device { struct vfio_pci_core_device core_device; int vf_id; @@ -46,6 +94,7 @@ struct mlx5vf_pci_core_device { u8 migrate_cap:1; u8 deferred_reset:1; u8 mdev_detach:1; + u8 log_active:1; /* protect migration state */ struct mutex state_mutex; enum vfio_device_mig_state mig_state; @@ -53,6 +102,7 @@ struct mlx5vf_pci_core_device { spinlock_t reset_lock; struct mlx5_vf_migration_file *resuming_migf; struct mlx5_vf_migration_file *saving_migf; + struct mlx5_vhca_page_tracker tracker; struct workqueue_struct *cb_wq; struct notifier_block nb; struct mlx5_core_dev *mdev; @@ -73,4 +123,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); +int mlx5vf_start_page_tracker(struct vfio_device *vdev, + struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); +int mlx5vf_stop_page_tracker(struct vfio_device *vdev); #endif /* MLX5_VFIO_CMD_H */ From c1d050b0d169fd60c8acef157db53bd4e3141799 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:45 +0300 Subject: [PATCH 24/77] vfio/mlx5: Create and destroy page tracker object Add support for creating and destroying page tracker object. This object is used to control/report the device dirty pages. As part of creating the tracker need to consider the device capabilities for max ranges and adapt/combine ranges accordingly. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-8-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 147 ++++++++++++++++++++++++++++++++++++ drivers/vfio/pci/mlx5/cmd.h | 1 + 2 files changed, 148 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0a362796d567..f1cad96af6ab 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -410,6 +410,148 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, return err; } +static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, + u32 req_nodes) +{ + struct interval_tree_node *prev, *curr, *comb_start, *comb_end; + unsigned long min_gap; + unsigned long curr_gap; + + /* Special shortcut when a single range is required */ + if (req_nodes == 1) { + unsigned long last; + + curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); + while (curr) { + last = curr->last; + prev = curr; + curr = interval_tree_iter_next(curr, 0, ULONG_MAX); + if (prev != comb_start) + interval_tree_remove(prev, root); + } + comb_start->last = last; + return; + } + + /* Combine ranges which have the smallest gap */ + while (cur_nodes > req_nodes) { + prev = NULL; + min_gap = ULONG_MAX; + curr = interval_tree_iter_first(root, 0, ULONG_MAX); + while (curr) { + if (prev) { + curr_gap = curr->start - prev->last; + if (curr_gap < min_gap) { + min_gap = curr_gap; + comb_start = prev; + comb_end = curr; + } + } + prev = curr; + curr = interval_tree_iter_next(curr, 0, ULONG_MAX); + } + comb_start->last = comb_end->last; + interval_tree_remove(comb_end, root); + cur_nodes--; + } +} + +static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, + struct mlx5vf_pci_core_device *mvdev, + struct rb_root_cached *ranges, u32 nnodes) +{ + int max_num_range = + MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range); + struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; + int record_size = MLX5_ST_SZ_BYTES(page_track_range); + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + struct interval_tree_node *node = NULL; + u64 total_ranges_len = 0; + u32 num_ranges = nnodes; + u8 log_addr_space_size; + void *range_list_ptr; + void *obj_context; + void *cmd_hdr; + int inlen; + void *in; + int err; + int i; + + if (num_ranges > max_num_range) { + combine_ranges(ranges, nnodes, max_num_range); + num_ranges = max_num_range; + } + + inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) + + record_size * num_ranges; + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in, + general_obj_in_cmd_hdr); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, + MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, + MLX5_OBJ_TYPE_PAGE_TRACK); + obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context); + MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id); + MLX5_SET(page_track, obj_context, track_type, 1); + MLX5_SET(page_track, obj_context, log_page_size, + ilog2(tracker->host_qp->tracked_page_size)); + MLX5_SET(page_track, obj_context, log_msg_size, + ilog2(tracker->host_qp->max_msg_size)); + MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn); + MLX5_SET(page_track, obj_context, num_ranges, num_ranges); + + range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range); + node = interval_tree_iter_first(ranges, 0, ULONG_MAX); + for (i = 0; i < num_ranges; i++) { + void *addr_range_i_base = range_list_ptr + record_size * i; + unsigned long length = node->last - node->start; + + MLX5_SET64(page_track_range, addr_range_i_base, start_address, + node->start); + MLX5_SET64(page_track_range, addr_range_i_base, length, length); + total_ranges_len += length; + node = interval_tree_iter_next(node, 0, ULONG_MAX); + } + + WARN_ON(node); + log_addr_space_size = ilog2(total_ranges_len); + if (log_addr_space_size < + (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) || + log_addr_space_size > + (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) { + err = -EOPNOTSUPP; + goto out; + } + + MLX5_SET(page_track, obj_context, log_addr_space_size, + log_addr_space_size); + err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out)); + if (err) + goto out; + + tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); +out: + kfree(in); + return err; +} + +static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev, + u32 tracker_id) +{ + u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + + MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, struct mlx5_vhca_cq_buf *buf, int nent, int cqe_size) @@ -833,6 +975,7 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev) WARN_ON(mvdev->mdev_detach); + mlx5vf_cmd_destroy_tracker(mdev, tracker->id); mlx5vf_destroy_qp(mdev, tracker->fw_qp); mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp); mlx5vf_destroy_qp(mdev, tracker->host_qp); @@ -941,6 +1084,10 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev, tracker->host_qp = host_qp; tracker->fw_qp = fw_qp; + err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes); + if (err) + goto err_activate; + *page_size = host_qp->tracked_page_size; mvdev->log_active = true; mlx5vf_state_mutex_unlock(mvdev); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index e71ec017bf04..658925ba5459 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -80,6 +80,7 @@ struct mlx5_vhca_qp { }; struct mlx5_vhca_page_tracker { + u32 id; u32 pdn; struct mlx5_uars_page *uar; struct mlx5_vhca_cq cq; From 1047797e8ed4bb8c20d1b5b843cf870d00bb0ff7 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:46 +0300 Subject: [PATCH 25/77] vfio/mlx5: Report dirty pages from tracker Report dirty pages from tracker. It includes: Querying for dirty pages in a given IOVA range, this is done by modifying the tracker into the reporting state and supplying the required range. Using the CQ event completion mechanism to be notified once data is ready on the CQ/QP to be processed. Once data is available turn on the corresponding bits in the bit map. This functionality will be used as part of the 'log_read_and_clear' driver callback in the next patches. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-9-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 191 ++++++++++++++++++++++++++++++++++++ drivers/vfio/pci/mlx5/cmd.h | 4 + 2 files changed, 195 insertions(+) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index f1cad96af6ab..fa9ddd926500 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -5,6 +5,8 @@ #include "cmd.h" +enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 }; + static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, u16 *vhca_id); static void @@ -157,6 +159,7 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; mvdev->core_device.vdev.mig_ops = mig_ops; + init_completion(&mvdev->tracker_comp); end: mlx5_vf_put_core_dev(mvdev->mdev); @@ -552,6 +555,29 @@ static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev, return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); } +static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev, + u32 tracker_id, unsigned long iova, + unsigned long length, u32 tracker_state) +{ + u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {}; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + void *obj_context; + void *cmd_hdr; + + cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK); + MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id); + + obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context); + MLX5_SET64(page_track, obj_context, modify_field_select, 0x3); + MLX5_SET64(page_track, obj_context, range_start_address, iova); + MLX5_SET64(page_track, obj_context, length, length); + MLX5_SET(page_track, obj_context, state, tracker_state); + + return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); +} + static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev, struct mlx5_vhca_cq_buf *buf, int nent, int cqe_size) @@ -593,6 +619,16 @@ static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev, mlx5_db_free(mdev, &cq->db); } +static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq, + struct mlx5_eqe *eqe) +{ + struct mlx5vf_pci_core_device *mvdev = + container_of(mcq, struct mlx5vf_pci_core_device, + tracker.cq.mcq); + + complete(&mvdev->tracker_comp); +} + static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, struct mlx5_vhca_page_tracker *tracker, size_t ncqe) @@ -643,10 +679,13 @@ static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas); + cq->mcq.comp = mlx5vf_cq_complete; err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); if (err) goto err_vec; + mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map, + cq->mcq.cons_index); kvfree(in); return 0; @@ -1109,3 +1148,155 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev, mlx5vf_state_mutex_unlock(mvdev); return err; } + +static void +set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp, + struct iova_bitmap *dirty) +{ + u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry); + u32 nent = size / entry_size; + struct page *page; + u64 addr; + u64 *buf; + int i; + + if (WARN_ON(index >= qp->recv_buf.npages || + (nent > qp->max_msg_size / entry_size))) + return; + + page = qp->recv_buf.page_list[index]; + buf = kmap_local_page(page); + for (i = 0; i < nent; i++) { + addr = MLX5_GET(page_track_report_entry, buf + i, + dirty_address_low); + addr |= (u64)MLX5_GET(page_track_report_entry, buf + i, + dirty_address_high) << 32; + iova_bitmap_set(dirty, addr, qp->tracked_page_size); + } + kunmap_local(buf); +} + +static void +mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe, + struct iova_bitmap *dirty, int *tracker_status) +{ + u32 size; + int ix; + + qp->rq.cc++; + *tracker_status = be32_to_cpu(cqe->immediate) >> 28; + size = be32_to_cpu(cqe->byte_cnt); + ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1); + + /* zero length CQE, no data */ + WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING); + if (size) + set_report_output(size, ix, qp, dirty); + + qp->recv_buf.next_rq_offset = ix * qp->max_msg_size; + mlx5vf_post_recv(qp); +} + +static void *get_cqe(struct mlx5_vhca_cq *cq, int n) +{ + return mlx5_frag_buf_get_wqe(&cq->buf.fbc, n); +} + +static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n) +{ + void *cqe = get_cqe(cq, n & (cq->ncqe - 1)); + struct mlx5_cqe64 *cqe64; + + cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; + + if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) { + return cqe64; + } else { + return NULL; + } +} + +static int +mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp, + struct iova_bitmap *dirty, int *tracker_status) +{ + struct mlx5_cqe64 *cqe; + u8 opcode; + + cqe = get_sw_cqe(cq, cq->mcq.cons_index); + if (!cqe) + return CQ_EMPTY; + + ++cq->mcq.cons_index; + /* + * Make sure we read CQ entry contents after we've checked the + * ownership bit. + */ + rmb(); + opcode = get_cqe_opcode(cqe); + switch (opcode) { + case MLX5_CQE_RESP_SEND_IMM: + mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status); + return CQ_OK; + default: + return CQ_POLL_ERR; + } +} + +int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, + unsigned long length, + struct iova_bitmap *dirty) +{ + struct mlx5vf_pci_core_device *mvdev = container_of( + vdev, struct mlx5vf_pci_core_device, core_device.vdev); + struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker; + struct mlx5_vhca_cq *cq = &tracker->cq; + struct mlx5_core_dev *mdev; + int poll_err, err; + + mutex_lock(&mvdev->state_mutex); + if (!mvdev->log_active) { + err = -EINVAL; + goto end; + } + + if (mvdev->mdev_detach) { + err = -ENOTCONN; + goto end; + } + + mdev = mvdev->mdev; + err = mlx5vf_cmd_modify_tracker(mdev, tracker->id, iova, length, + MLX5_PAGE_TRACK_STATE_REPORTING); + if (err) + goto end; + + tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING; + while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING) { + poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty, + &tracker->status); + if (poll_err == CQ_EMPTY) { + mlx5_cq_arm(&cq->mcq, MLX5_CQ_DB_REQ_NOT, tracker->uar->map, + cq->mcq.cons_index); + poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, + dirty, &tracker->status); + if (poll_err == CQ_EMPTY) { + wait_for_completion(&mvdev->tracker_comp); + continue; + } + } + if (poll_err == CQ_POLL_ERR) { + err = -EIO; + goto end; + } + mlx5_cq_set_ci(&cq->mcq); + } + + if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR) + err = -EIO; + +end: + mlx5vf_state_mutex_unlock(mvdev); + return err; +} diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 658925ba5459..fa1f9ab4d3d0 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -86,6 +86,7 @@ struct mlx5_vhca_page_tracker { struct mlx5_vhca_cq cq; struct mlx5_vhca_qp *host_qp; struct mlx5_vhca_qp *fw_qp; + int status; }; struct mlx5vf_pci_core_device { @@ -96,6 +97,7 @@ struct mlx5vf_pci_core_device { u8 deferred_reset:1; u8 mdev_detach:1; u8 log_active:1; + struct completion tracker_comp; /* protect migration state */ struct mutex state_mutex; enum vfio_device_mig_state mig_state; @@ -127,4 +129,6 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work); int mlx5vf_start_page_tracker(struct vfio_device *vdev, struct rb_root_cached *ranges, u32 nnodes, u64 *page_size); int mlx5vf_stop_page_tracker(struct vfio_device *vdev); +int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, + unsigned long length, struct iova_bitmap *dirty); #endif /* MLX5_VFIO_CMD_H */ From e295738756ebd0726f1ed51e02f343a37233437b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:47 +0300 Subject: [PATCH 26/77] vfio/mlx5: Manage error scenarios on tracker Handle async error events and health/recovery flow to safely stop the tracker upon error scenarios. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-10-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 61 +++++++++++++++++++++++++++++++++++-- drivers/vfio/pci/mlx5/cmd.h | 2 ++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index fa9ddd926500..3e92b4d92be2 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -70,6 +70,13 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, return 0; } +static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev) +{ + /* Mark the tracker under an error and wake it up if it's running */ + mvdev->tracker.is_err = true; + complete(&mvdev->tracker_comp); +} + static int mlx5fv_vf_event(struct notifier_block *nb, unsigned long event, void *data) { @@ -100,6 +107,8 @@ void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev) if (!mvdev->migrate_cap) return; + /* Must be done outside the lock to let it progress */ + set_tracker_error(mvdev); mutex_lock(&mvdev->state_mutex); mlx5vf_disable_fds(mvdev); _mlx5vf_free_page_tracker_resources(mvdev); @@ -619,6 +628,47 @@ static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev, mlx5_db_free(mdev, &cq->db); } +static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type) +{ + if (type != MLX5_EVENT_TYPE_CQ_ERROR) + return; + + set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device, + tracker.cq.mcq)); +} + +static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type, + void *data) +{ + struct mlx5_vhca_page_tracker *tracker = + mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb); + struct mlx5vf_pci_core_device *mvdev = container_of( + tracker, struct mlx5vf_pci_core_device, tracker); + struct mlx5_eqe *eqe = data; + u8 event_type = (u8)type; + u8 queue_type; + int qp_num; + + switch (event_type) { + case MLX5_EVENT_TYPE_WQ_CATAS_ERROR: + case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: + case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: + queue_type = eqe->data.qp_srq.type; + if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP) + break; + qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + if (qp_num != tracker->host_qp->qpn && + qp_num != tracker->fw_qp->qpn) + break; + set_tracker_error(mvdev); + break; + default: + break; + } + + return NOTIFY_OK; +} + static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) { @@ -680,6 +730,7 @@ static int mlx5vf_create_cq(struct mlx5_core_dev *mdev, pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas); mlx5_fill_page_frag_array(&cq->buf.frag_buf, pas); cq->mcq.comp = mlx5vf_cq_complete; + cq->mcq.event = mlx5vf_cq_event; err = mlx5_core_create_cq(mdev, &cq->mcq, in, inlen, out, sizeof(out)); if (err) goto err_vec; @@ -1014,6 +1065,7 @@ _mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev) WARN_ON(mvdev->mdev_detach); + mlx5_eq_notifier_unregister(mdev, &tracker->nb); mlx5vf_cmd_destroy_tracker(mdev, tracker->id); mlx5vf_destroy_qp(mdev, tracker->fw_qp); mlx5vf_free_qp_recv_resources(mdev, tracker->host_qp); @@ -1127,6 +1179,8 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev, if (err) goto err_activate; + MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY); + mlx5_eq_notifier_register(mdev, &tracker->nb); *page_size = host_qp->tracked_page_size; mvdev->log_active = true; mlx5vf_state_mutex_unlock(mvdev); @@ -1273,7 +1327,8 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, goto end; tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING; - while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING) { + while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING && + !tracker->is_err) { poll_err = mlx5vf_cq_poll_one(cq, tracker->host_qp, dirty, &tracker->status); if (poll_err == CQ_EMPTY) { @@ -1294,8 +1349,10 @@ int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova, } if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR) - err = -EIO; + tracker->is_err = true; + if (tracker->is_err) + err = -EIO; end: mlx5vf_state_mutex_unlock(mvdev); return err; diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index fa1f9ab4d3d0..8b0ae40c620c 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -82,10 +82,12 @@ struct mlx5_vhca_qp { struct mlx5_vhca_page_tracker { u32 id; u32 pdn; + u8 is_err:1; struct mlx5_uars_page *uar; struct mlx5_vhca_cq cq; struct mlx5_vhca_qp *host_qp; struct mlx5_vhca_qp *fw_qp; + struct mlx5_nb nb; int status; }; From f39856aacb078c1c93acef011a37121b17d54fe0 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 8 Sep 2022 21:34:48 +0300 Subject: [PATCH 27/77] vfio/mlx5: Set the driver DMA logging callbacks Now that everything is ready set the driver DMA logging callbacks if supported by the device. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20220908183448.195262-11-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 5 ++++- drivers/vfio/pci/mlx5/cmd.h | 3 ++- drivers/vfio/pci/mlx5/main.c | 9 ++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 3e92b4d92be2..c604b70437a5 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -126,7 +126,8 @@ void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev) } void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, - const struct vfio_migration_ops *mig_ops) + const struct vfio_migration_ops *mig_ops, + const struct vfio_log_ops *log_ops) { struct pci_dev *pdev = mvdev->core_device.pdev; int ret; @@ -169,6 +170,8 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, VFIO_MIGRATION_P2P; mvdev->core_device.vdev.mig_ops = mig_ops; init_completion(&mvdev->tracker_comp); + if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization)) + mvdev->core_device.vdev.log_ops = log_ops; end: mlx5_vf_put_core_dev(mvdev->mdev); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8b0ae40c620c..921d5720a1e5 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -118,7 +118,8 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, size_t *state_size); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, - const struct vfio_migration_ops *mig_ops); + const struct vfio_migration_ops *mig_ops, + const struct vfio_log_ops *log_ops); void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev); int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index a9b63d15c5d3..759a5f5f7b3f 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -579,6 +579,12 @@ static const struct vfio_migration_ops mlx5vf_pci_mig_ops = { .migration_get_state = mlx5vf_pci_get_device_state, }; +static const struct vfio_log_ops mlx5vf_pci_log_ops = { + .log_start = mlx5vf_start_page_tracker, + .log_stop = mlx5vf_stop_page_tracker, + .log_read_and_clear = mlx5vf_tracker_read_and_clear, +}; + static const struct vfio_device_ops mlx5vf_pci_ops = { .name = "mlx5-vfio-pci", .open_device = mlx5vf_pci_open_device, @@ -602,7 +608,8 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev, if (!mvdev) return -ENOMEM; vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops); - mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops); + mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, + &mlx5vf_pci_log_ops); dev_set_drvdata(&pdev->dev, &mvdev->core_device); ret = vfio_pci_core_register_device(&mvdev->core_device); if (ret) From cb9ff3f3b84c95867856c3be42de73972feb1249 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:47 +0800 Subject: [PATCH 28/77] vfio: Add helpers for unifying vfio_device life cycle The idea is to let vfio core manage the vfio_device life cycle instead of duplicating the logic cross drivers. This is also a preparatory step for adding struct device into vfio_device. New pair of helpers together with a kref in vfio_device: - vfio_alloc_device() - vfio_put_device() Drivers can register @init/@release callbacks to manage any private state wrapping the vfio_device. However vfio-ccw doesn't fit this model due to a life cycle mess that its private structure mixes both parent and mdev info hence must be allocated/freed outside of the life cycle of vfio device. Per prior discussions this won't be fixed in short term by IBM folks. Instead of waiting for those modifications introduce another helper vfio_init_device() so ccw can call it to initialize a pre-allocated vfio_device. Further implication of the ccw trick is that vfio_device cannot be freed uniformly in vfio core. Instead, require *EVERY* driver to implement @release and free vfio_device inside. Then ccw can choose to delay the free at its own discretion. Another trick down the road is that kvzalloc() is used to accommodate the need of gvt which uses vzalloc() while all others use kzalloc(). So drivers should call a helper vfio_free_device() to free the vfio_device instead of assuming that kfree() or vfree() is appliable. Later once the ccw mess is fixed we can remove those tricks and fully handle structure alloc/free in vfio core. Existing vfio_{un}init_group_dev() will be deprecated after all existing usages are converted to the new model. Suggested-by: Jason Gunthorpe Co-developed-by: Yi Liu Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Tony Krowiak Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-2-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 92 ++++++++++++++++++++++++++++++++++++++++ include/linux/vfio.h | 25 ++++++++++- 2 files changed, 116 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 27d9186f35d5..b9c6a97d647a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -498,6 +498,98 @@ void vfio_uninit_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); +/* Release helper called by vfio_put_device() */ +void vfio_device_release(struct kref *kref) +{ + struct vfio_device *device = + container_of(kref, struct vfio_device, kref); + + vfio_uninit_group_dev(device); + + /* + * kvfree() cannot be done here due to a life cycle mess in + * vfio-ccw. Before the ccw part is fixed all drivers are + * required to support @release and call vfio_free_device() + * from there. + */ + device->ops->release(device); +} +EXPORT_SYMBOL_GPL(vfio_device_release); + +/* + * Allocate and initialize vfio_device so it can be registered to vfio + * core. + * + * Drivers should use the wrapper vfio_alloc_device() for allocation. + * @size is the size of the structure to be allocated, including any + * private data used by the driver. + * + * Driver may provide an @init callback to cover device private data. + * + * Use vfio_put_device() to release the structure after success return. + */ +struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, + const struct vfio_device_ops *ops) +{ + struct vfio_device *device; + int ret; + + if (WARN_ON(size < sizeof(struct vfio_device))) + return ERR_PTR(-EINVAL); + + device = kvzalloc(size, GFP_KERNEL); + if (!device) + return ERR_PTR(-ENOMEM); + + ret = vfio_init_device(device, dev, ops); + if (ret) + goto out_free; + return device; + +out_free: + kvfree(device); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(_vfio_alloc_device); + +/* + * Initialize a vfio_device so it can be registered to vfio core. + * + * Only vfio-ccw driver should call this interface. + */ +int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops) +{ + int ret; + + vfio_init_group_dev(device, dev, ops); + + if (ops->init) { + ret = ops->init(device); + if (ret) + goto out_uninit; + } + + kref_init(&device->kref); + return 0; + +out_uninit: + vfio_uninit_group_dev(device); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_init_device); + +/* + * The helper called by driver @release callback to free the device + * structure. Drivers which don't have private data to clean can + * simply use this helper as its @release. + */ +void vfio_free_device(struct vfio_device *device) +{ + kvfree(device); +} +EXPORT_SYMBOL_GPL(vfio_free_device); + static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, enum vfio_group_type type) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 0e2826559091..f67cac700e6f 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -47,7 +47,8 @@ struct vfio_device { struct kvm *kvm; /* Members below here are private, not for driver use */ - refcount_t refcount; + struct kref kref; /* object life cycle */ + refcount_t refcount; /* user count on registered device*/ unsigned int open_count; struct completion comp; struct list_head group_next; @@ -57,6 +58,8 @@ struct vfio_device { /** * struct vfio_device_ops - VFIO bus driver device callbacks * + * @init: initialize private fields in device structure + * @release: Reclaim private fields in device structure * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -74,6 +77,8 @@ struct vfio_device { */ struct vfio_device_ops { char *name; + int (*init)(struct vfio_device *vdev); + void (*release)(struct vfio_device *vdev); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -161,6 +166,24 @@ static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops, return 1; } +struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, + const struct vfio_device_ops *ops); +#define vfio_alloc_device(dev_struct, member, dev, ops) \ + container_of(_vfio_alloc_device(sizeof(struct dev_struct) + \ + BUILD_BUG_ON_ZERO(offsetof( \ + struct dev_struct, member)), \ + dev, ops), \ + struct dev_struct, member) + +int vfio_init_device(struct vfio_device *device, struct device *dev, + const struct vfio_device_ops *ops); +void vfio_free_device(struct vfio_device *device); +void vfio_device_release(struct kref *kref); +static inline void vfio_put_device(struct vfio_device *device) +{ + kref_put(&device->kref, vfio_device_release); +} + void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_uninit_group_dev(struct vfio_device *device); From 63d7c77989de98d3e92611dbb858028b74dca377 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:48 +0800 Subject: [PATCH 29/77] vfio/pci: Use the new device life cycle helpers Also introduce two pci core helpers as @init/@release for pci drivers: - vfio_pci_core_init_dev() - vfio_pci_core_release_dev() Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-3-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 20 +++++++++--------- drivers/vfio/pci/vfio_pci_core.c | 35 ++++++++++++++++++++++++++++++++ include/linux/vfio_pci_core.h | 2 ++ 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index d9b5c03f8d5b..1d4919edfbde 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -127,6 +127,8 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", + .init = vfio_pci_core_init_dev, + .release = vfio_pci_core_release_dev, .open_device = vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, @@ -146,20 +148,19 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (vfio_pci_is_denylisted(pdev)) return -EINVAL; - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; - vfio_pci_core_init_device(vdev, pdev, &vfio_pci_ops); + vdev = vfio_alloc_device(vfio_pci_core_device, vdev, &pdev->dev, + &vfio_pci_ops); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); dev_set_drvdata(&pdev->dev, vdev); ret = vfio_pci_core_register_device(vdev); if (ret) - goto out_free; + goto out_put_vdev; return 0; -out_free: - vfio_pci_core_uninit_device(vdev); - kfree(vdev); +out_put_vdev: + vfio_put_device(&vdev->vdev); return ret; } @@ -168,8 +169,7 @@ static void vfio_pci_remove(struct pci_dev *pdev) struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev); vfio_pci_core_unregister_device(vdev); - vfio_pci_core_uninit_device(vdev); - kfree(vdev); + vfio_put_device(&vdev->vdev); } static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 0a801aee2f2d..77d33739c6e8 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2078,6 +2078,41 @@ static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev) VGA_RSRC_LEGACY_MEM); } +int vfio_pci_core_init_dev(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + + vdev->pdev = to_pci_dev(core_vdev->dev); + vdev->irq_type = VFIO_PCI_NUM_IRQS; + mutex_init(&vdev->igate); + spin_lock_init(&vdev->irqlock); + mutex_init(&vdev->ioeventfds_lock); + INIT_LIST_HEAD(&vdev->dummy_resources_list); + INIT_LIST_HEAD(&vdev->ioeventfds_list); + mutex_init(&vdev->vma_lock); + INIT_LIST_HEAD(&vdev->vma_list); + INIT_LIST_HEAD(&vdev->sriov_pfs_item); + init_rwsem(&vdev->memory_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev); + +void vfio_pci_core_release_dev(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + + mutex_destroy(&vdev->igate); + mutex_destroy(&vdev->ioeventfds_lock); + mutex_destroy(&vdev->vma_lock); + kfree(vdev->region); + kfree(vdev->pm_save); + vfio_free_device(core_vdev); +} +EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); + void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, struct pci_dev *pdev, const struct vfio_device_ops *vfio_pci_ops) diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 089b603bcfdc..0499ea836058 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -109,6 +109,8 @@ void vfio_pci_core_close_device(struct vfio_device *core_vdev); void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, struct pci_dev *pdev, const struct vfio_device_ops *vfio_pci_ops); +int vfio_pci_core_init_dev(struct vfio_device *core_vdev); +void vfio_pci_core_release_dev(struct vfio_device *core_vdev); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); From d3966e305ac4e0b5a63f784d9152fac4961554de Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:49 +0800 Subject: [PATCH 30/77] vfio/mlx5: Use the new device life cycle helpers mlx5 has its own @init/@release for handling migration cap. Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-4-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/main.c | 50 ++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 759a5f5f7b3f..fd6ccb8454a2 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -585,8 +585,35 @@ static const struct vfio_log_ops mlx5vf_pci_log_ops = { .log_read_and_clear = mlx5vf_tracker_read_and_clear, }; +static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev) +{ + struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, + struct mlx5vf_pci_core_device, core_device.vdev); + int ret; + + ret = vfio_pci_core_init_dev(core_vdev); + if (ret) + return ret; + + mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, + &mlx5vf_pci_log_ops); + + return 0; +} + +static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev) +{ + struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev, + struct mlx5vf_pci_core_device, core_device.vdev); + + mlx5vf_cmd_remove_migratable(mvdev); + vfio_pci_core_release_dev(core_vdev); +} + static const struct vfio_device_ops mlx5vf_pci_ops = { .name = "mlx5-vfio-pci", + .init = mlx5vf_pci_init_dev, + .release = mlx5vf_pci_release_dev, .open_device = mlx5vf_pci_open_device, .close_device = mlx5vf_pci_close_device, .ioctl = vfio_pci_core_ioctl, @@ -604,22 +631,19 @@ static int mlx5vf_pci_probe(struct pci_dev *pdev, struct mlx5vf_pci_core_device *mvdev; int ret; - mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL); - if (!mvdev) - return -ENOMEM; - vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops); - mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops, - &mlx5vf_pci_log_ops); + mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev, + &pdev->dev, &mlx5vf_pci_ops); + if (IS_ERR(mvdev)) + return PTR_ERR(mvdev); + dev_set_drvdata(&pdev->dev, &mvdev->core_device); ret = vfio_pci_core_register_device(&mvdev->core_device); if (ret) - goto out_free; + goto out_put_vdev; return 0; -out_free: - mlx5vf_cmd_remove_migratable(mvdev); - vfio_pci_core_uninit_device(&mvdev->core_device); - kfree(mvdev); +out_put_vdev: + vfio_put_device(&mvdev->core_device.vdev); return ret; } @@ -628,9 +652,7 @@ static void mlx5vf_pci_remove(struct pci_dev *pdev) struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev); vfio_pci_core_unregister_device(&mvdev->core_device); - mlx5vf_cmd_remove_migratable(mvdev); - vfio_pci_core_uninit_device(&mvdev->core_device); - kfree(mvdev); + vfio_put_device(&mvdev->core_device.vdev); } static const struct pci_device_id mlx5vf_pci_table[] = { From 27aeb915595b87165a3004aab05b2b837d01e6ed Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:50 +0800 Subject: [PATCH 31/77] vfio/hisi_acc: Use the new device life cycle helpers Tidy up @probe so all migration specific initialization logic is moved to migration specific @init callback. Remove vfio_pci_core_{un}init_device() given no user now. Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Shameer Kolothum Link: https://lore.kernel.org/r/20220921104401.38898-5-kevin.tian@intel.com Signed-off-by: Alex Williamson --- .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 80 +++++++++---------- drivers/vfio/pci/vfio_pci_core.c | 30 ------- include/linux/vfio_pci_core.h | 4 - 3 files changed, 37 insertions(+), 77 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 258cae0863ea..47174e2b61bd 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1213,8 +1213,28 @@ static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { .migration_get_state = hisi_acc_vfio_pci_get_device_state, }; +static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev, + struct hisi_acc_vf_core_device, core_device.vdev); + struct pci_dev *pdev = to_pci_dev(core_vdev->dev); + struct hisi_qm *pf_qm = hisi_acc_get_pf_qm(pdev); + + hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1; + hisi_acc_vdev->pf_qm = pf_qm; + hisi_acc_vdev->vf_dev = pdev; + mutex_init(&hisi_acc_vdev->state_mutex); + + core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY; + core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; + + return vfio_pci_core_init_dev(core_vdev); +} + static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .name = "hisi-acc-vfio-pci-migration", + .init = hisi_acc_vfio_pci_migrn_init_dev, + .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = hisi_acc_vfio_pci_close_device, .ioctl = hisi_acc_vfio_pci_ioctl, @@ -1228,6 +1248,8 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .name = "hisi-acc-vfio-pci", + .init = vfio_pci_core_init_dev, + .release = vfio_pci_core_release_dev, .open_device = hisi_acc_vfio_pci_open_device, .close_device = vfio_pci_core_close_device, .ioctl = vfio_pci_core_ioctl, @@ -1239,63 +1261,36 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .match = vfio_pci_core_match, }; -static int -hisi_acc_vfio_pci_migrn_init(struct hisi_acc_vf_core_device *hisi_acc_vdev, - struct pci_dev *pdev, struct hisi_qm *pf_qm) -{ - int vf_id; - - vf_id = pci_iov_vf_id(pdev); - if (vf_id < 0) - return vf_id; - - hisi_acc_vdev->vf_id = vf_id + 1; - hisi_acc_vdev->core_device.vdev.migration_flags = - VFIO_MIGRATION_STOP_COPY; - hisi_acc_vdev->pf_qm = pf_qm; - hisi_acc_vdev->vf_dev = pdev; - mutex_init(&hisi_acc_vdev->state_mutex); - - return 0; -} - static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct hisi_acc_vf_core_device *hisi_acc_vdev; + const struct vfio_device_ops *ops = &hisi_acc_vfio_pci_ops; struct hisi_qm *pf_qm; + int vf_id; int ret; - hisi_acc_vdev = kzalloc(sizeof(*hisi_acc_vdev), GFP_KERNEL); - if (!hisi_acc_vdev) - return -ENOMEM; - pf_qm = hisi_acc_get_pf_qm(pdev); if (pf_qm && pf_qm->ver >= QM_HW_V3) { - ret = hisi_acc_vfio_pci_migrn_init(hisi_acc_vdev, pdev, pf_qm); - if (!ret) { - vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev, - &hisi_acc_vfio_pci_migrn_ops); - hisi_acc_vdev->core_device.vdev.mig_ops = - &hisi_acc_vfio_pci_migrn_state_ops; - } else { + vf_id = pci_iov_vf_id(pdev); + if (vf_id >= 0) + ops = &hisi_acc_vfio_pci_migrn_ops; + else pci_warn(pdev, "migration support failed, continue with generic interface\n"); - vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev, - &hisi_acc_vfio_pci_ops); - } - } else { - vfio_pci_core_init_device(&hisi_acc_vdev->core_device, pdev, - &hisi_acc_vfio_pci_ops); } + hisi_acc_vdev = vfio_alloc_device(hisi_acc_vf_core_device, + core_device.vdev, &pdev->dev, ops); + if (IS_ERR(hisi_acc_vdev)) + return PTR_ERR(hisi_acc_vdev); + dev_set_drvdata(&pdev->dev, &hisi_acc_vdev->core_device); ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device); if (ret) - goto out_free; + goto out_put_vdev; return 0; -out_free: - vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device); - kfree(hisi_acc_vdev); +out_put_vdev: + vfio_put_device(&hisi_acc_vdev->core_device.vdev); return ret; } @@ -1304,8 +1299,7 @@ static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev) struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device); - vfio_pci_core_uninit_device(&hisi_acc_vdev->core_device); - kfree(hisi_acc_vdev); + vfio_put_device(&hisi_acc_vdev->core_device.vdev); } static const struct pci_device_id hisi_acc_vfio_pci_table[] = { diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 77d33739c6e8..59a28251bb0b 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2113,36 +2113,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev) } EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev); -void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, - struct pci_dev *pdev, - const struct vfio_device_ops *vfio_pci_ops) -{ - vfio_init_group_dev(&vdev->vdev, &pdev->dev, vfio_pci_ops); - vdev->pdev = pdev; - vdev->irq_type = VFIO_PCI_NUM_IRQS; - mutex_init(&vdev->igate); - spin_lock_init(&vdev->irqlock); - mutex_init(&vdev->ioeventfds_lock); - INIT_LIST_HEAD(&vdev->dummy_resources_list); - INIT_LIST_HEAD(&vdev->ioeventfds_list); - mutex_init(&vdev->vma_lock); - INIT_LIST_HEAD(&vdev->vma_list); - INIT_LIST_HEAD(&vdev->sriov_pfs_item); - init_rwsem(&vdev->memory_lock); -} -EXPORT_SYMBOL_GPL(vfio_pci_core_init_device); - -void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev) -{ - mutex_destroy(&vdev->igate); - mutex_destroy(&vdev->ioeventfds_lock); - mutex_destroy(&vdev->vma_lock); - vfio_uninit_group_dev(&vdev->vdev); - kfree(vdev->region); - kfree(vdev->pm_save); -} -EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); - int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 0499ea836058..367fd79226a3 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -106,13 +106,9 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga, bool is_disable_idle_d3); void vfio_pci_core_close_device(struct vfio_device *core_vdev); -void vfio_pci_core_init_device(struct vfio_pci_core_device *vdev, - struct pci_dev *pdev, - const struct vfio_device_ops *vfio_pci_ops); int vfio_pci_core_init_dev(struct vfio_device *core_vdev); void vfio_pci_core_release_dev(struct vfio_device *core_vdev); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev); -void vfio_pci_core_uninit_device(struct vfio_pci_core_device *vdev); void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev); extern const struct pci_error_handlers vfio_pci_core_err_handlers; int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, From 603c09f2873d6e86dce636c9e5ae330e2c033940 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:51 +0800 Subject: [PATCH 32/77] vfio/mdpy: Use the new device life cycle helpers and manage mdpy_count inside @init/@release. Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-6-kevin.tian@intel.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mdpy.c | 83 +++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index e8c46eb2e246..bb2af1ec0f7c 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -216,63 +216,79 @@ static int mdpy_reset(struct mdev_state *mdev_state) return 0; } -static int mdpy_probe(struct mdev_device *mdev) +static int mdpy_init_dev(struct vfio_device *vdev) { + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); const struct mdpy_type *type = &mdpy_types[mdev_get_type_group_id(mdev)]; - struct device *dev = mdev_dev(mdev); - struct mdev_state *mdev_state; u32 fbsize; - int ret; + int ret = -ENOMEM; if (mdpy_count >= max_devices) - return -ENOMEM; - - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) - return -ENOMEM; - vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mdpy_dev_ops); + return ret; mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) { - ret = -ENOMEM; - goto err_state; - } + if (!mdev_state->vconfig) + return ret; fbsize = roundup_pow_of_two(type->width * type->height * type->bytepp); mdev_state->memblk = vmalloc_user(fbsize); - if (!mdev_state->memblk) { - ret = -ENOMEM; - goto err_vconfig; - } - dev_info(dev, "%s: %s (%dx%d)\n", __func__, type->name, type->width, - type->height); + if (!mdev_state->memblk) + goto out_vconfig; mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mdev_state->type = type; + mdev_state->type = type; mdev_state->memsize = fbsize; mdpy_create_config_space(mdev_state); mdpy_reset(mdev_state); + dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->name, type->width, + type->height); + mdpy_count++; + return 0; + +out_vconfig: + kfree(mdev_state->vconfig); + return ret; +} + +static int mdpy_probe(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + int ret; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mdpy_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); if (ret) - goto err_mem; + goto err_put_vdev; dev_set_drvdata(&mdev->dev, mdev_state); return 0; -err_mem: - vfree(mdev_state->memblk); -err_vconfig: - kfree(mdev_state->vconfig); -err_state: - vfio_uninit_group_dev(&mdev_state->vdev); - kfree(mdev_state); + +err_put_vdev: + vfio_put_device(&mdev_state->vdev); return ret; } +static void mdpy_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + mdpy_count--; + vfree(mdev_state->memblk); + kfree(mdev_state->vconfig); + vfio_free_device(vdev); +} + static void mdpy_remove(struct mdev_device *mdev) { struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); @@ -280,12 +296,7 @@ static void mdpy_remove(struct mdev_device *mdev) dev_info(&mdev->dev, "%s\n", __func__); vfio_unregister_group_dev(&mdev_state->vdev); - vfree(mdev_state->memblk); - kfree(mdev_state->vconfig); - vfio_uninit_group_dev(&mdev_state->vdev); - kfree(mdev_state); - - mdpy_count--; + vfio_put_device(&mdev_state->vdev); } static ssize_t mdpy_read(struct vfio_device *vdev, char __user *buf, @@ -708,6 +719,8 @@ static struct attribute_group *mdev_type_groups[] = { }; static const struct vfio_device_ops mdpy_dev_ops = { + .init = mdpy_init_dev, + .release = mdpy_release_dev, .read = mdpy_read, .write = mdpy_write, .ioctl = mdpy_ioctl, From 67c5a1814f4c1ad0e61f11112010057191730853 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:52 +0800 Subject: [PATCH 33/77] vfio/mtty: Use the new device life cycle helpers and manage available ports inside @init/@release. Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-7-kevin.tian@intel.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mtty.c | 71 +++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index f42a59ed2e3f..d151928e4f21 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -703,9 +703,11 @@ static ssize_t mdev_access(struct mdev_state *mdev_state, u8 *buf, size_t count, return ret; } -static int mtty_probe(struct mdev_device *mdev) +static int mtty_init_dev(struct vfio_device *vdev) { - struct mdev_state *mdev_state; + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); int nr_ports = mdev_get_type_group_id(mdev) + 1; int avail_ports = atomic_read(&mdev_avail_ports); int ret; @@ -716,58 +718,65 @@ static int mtty_probe(struct mdev_device *mdev) } while (!atomic_try_cmpxchg(&mdev_avail_ports, &avail_ports, avail_ports - nr_ports)); - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) { - ret = -ENOMEM; - goto err_nr_ports; - } - - vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mtty_dev_ops); - mdev_state->nr_ports = nr_ports; mdev_state->irq_index = -1; mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE; mutex_init(&mdev_state->rxtx_lock); - mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) { + mdev_state->vconfig = kzalloc(MTTY_CONFIG_SPACE_SIZE, GFP_KERNEL); + if (!mdev_state->vconfig) { ret = -ENOMEM; - goto err_state; + goto err_nr_ports; } mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; - mtty_create_config_space(mdev_state); - - ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); - if (ret) - goto err_vconfig; - dev_set_drvdata(&mdev->dev, mdev_state); return 0; -err_vconfig: - kfree(mdev_state->vconfig); -err_state: - vfio_uninit_group_dev(&mdev_state->vdev); - kfree(mdev_state); err_nr_ports: atomic_add(nr_ports, &mdev_avail_ports); return ret; } +static int mtty_probe(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + int ret; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mtty_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); + + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); + if (ret) + goto err_put_vdev; + dev_set_drvdata(&mdev->dev, mdev_state); + return 0; + +err_put_vdev: + vfio_put_device(&mdev_state->vdev); + return ret; +} + +static void mtty_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + atomic_add(mdev_state->nr_ports, &mdev_avail_ports); + kfree(mdev_state->vconfig); + vfio_free_device(vdev); +} + static void mtty_remove(struct mdev_device *mdev) { struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); - int nr_ports = mdev_state->nr_ports; vfio_unregister_group_dev(&mdev_state->vdev); - - kfree(mdev_state->vconfig); - vfio_uninit_group_dev(&mdev_state->vdev); - kfree(mdev_state); - atomic_add(nr_ports, &mdev_avail_ports); + vfio_put_device(&mdev_state->vdev); } static int mtty_reset(struct mdev_state *mdev_state) @@ -1287,6 +1296,8 @@ static struct attribute_group *mdev_type_groups[] = { static const struct vfio_device_ops mtty_dev_ops = { .name = "vfio-mtty", + .init = mtty_init_dev, + .release = mtty_release_dev, .read = mtty_read, .write = mtty_write, .ioctl = mtty_ioctl, From 3d5d18e1f899ac3b03e108aef8560f4cb0969da1 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:53 +0800 Subject: [PATCH 34/77] vfio/mbochs: Use the new device life cycle helpers and manage avail_mbytes inside @init/@release. Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-8-kevin.tian@intel.com Signed-off-by: Alex Williamson --- samples/vfio-mdev/mbochs.c | 75 ++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 344c2901a82b..6901947e27d2 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -505,13 +505,14 @@ static int mbochs_reset(struct mdev_state *mdev_state) return 0; } -static int mbochs_probe(struct mdev_device *mdev) +static int mbochs_init_dev(struct vfio_device *vdev) { - int avail_mbytes = atomic_read(&mbochs_avail_mbytes); + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + struct mdev_device *mdev = to_mdev_device(vdev->dev); const struct mbochs_type *type = &mbochs_types[mdev_get_type_group_id(mdev)]; - struct device *dev = mdev_dev(mdev); - struct mdev_state *mdev_state; + int avail_mbytes = atomic_read(&mbochs_avail_mbytes); int ret = -ENOMEM; do { @@ -520,14 +521,9 @@ static int mbochs_probe(struct mdev_device *mdev) } while (!atomic_try_cmpxchg(&mbochs_avail_mbytes, &avail_mbytes, avail_mbytes - type->mbytes)); - mdev_state = kzalloc(sizeof(struct mdev_state), GFP_KERNEL); - if (mdev_state == NULL) - goto err_avail; - vfio_init_group_dev(&mdev_state->vdev, &mdev->dev, &mbochs_dev_ops); - mdev_state->vconfig = kzalloc(MBOCHS_CONFIG_SPACE_SIZE, GFP_KERNEL); - if (mdev_state->vconfig == NULL) - goto err_mem; + if (!mdev_state->vconfig) + goto err_avail; mdev_state->memsize = type->mbytes * 1024 * 1024; mdev_state->pagecount = mdev_state->memsize >> PAGE_SHIFT; @@ -535,10 +531,7 @@ static int mbochs_probe(struct mdev_device *mdev) sizeof(struct page *), GFP_KERNEL); if (!mdev_state->pages) - goto err_mem; - - dev_info(dev, "%s: %s, %d MB, %ld pages\n", __func__, - type->name, type->mbytes, mdev_state->pagecount); + goto err_vconfig; mutex_init(&mdev_state->ops_lock); mdev_state->mdev = mdev; @@ -553,31 +546,55 @@ static int mbochs_probe(struct mdev_device *mdev) mbochs_create_config_space(mdev_state); mbochs_reset(mdev_state); - ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); - if (ret) - goto err_mem; - dev_set_drvdata(&mdev->dev, mdev_state); + dev_info(vdev->dev, "%s: %s, %d MB, %ld pages\n", __func__, + type->name, type->mbytes, mdev_state->pagecount); return 0; -err_mem: - vfio_uninit_group_dev(&mdev_state->vdev); - kfree(mdev_state->pages); + +err_vconfig: kfree(mdev_state->vconfig); - kfree(mdev_state); err_avail: atomic_add(type->mbytes, &mbochs_avail_mbytes); return ret; } +static int mbochs_probe(struct mdev_device *mdev) +{ + struct mdev_state *mdev_state; + int ret = -ENOMEM; + + mdev_state = vfio_alloc_device(mdev_state, vdev, &mdev->dev, + &mbochs_dev_ops); + if (IS_ERR(mdev_state)) + return PTR_ERR(mdev_state); + + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); + if (ret) + goto err_put_vdev; + dev_set_drvdata(&mdev->dev, mdev_state); + return 0; + +err_put_vdev: + vfio_put_device(&mdev_state->vdev); + return ret; +} + +static void mbochs_release_dev(struct vfio_device *vdev) +{ + struct mdev_state *mdev_state = + container_of(vdev, struct mdev_state, vdev); + + atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); + kfree(mdev_state->pages); + kfree(mdev_state->vconfig); + vfio_free_device(vdev); +} + static void mbochs_remove(struct mdev_device *mdev) { struct mdev_state *mdev_state = dev_get_drvdata(&mdev->dev); vfio_unregister_group_dev(&mdev_state->vdev); - vfio_uninit_group_dev(&mdev_state->vdev); - atomic_add(mdev_state->type->mbytes, &mbochs_avail_mbytes); - kfree(mdev_state->pages); - kfree(mdev_state->vconfig); - kfree(mdev_state); + vfio_put_device(&mdev_state->vdev); } static ssize_t mbochs_read(struct vfio_device *vdev, char __user *buf, @@ -1397,6 +1414,8 @@ static struct attribute_group *mdev_type_groups[] = { static const struct vfio_device_ops mbochs_dev_ops = { .close_device = mbochs_close_device, + .init = mbochs_init_dev, + .release = mbochs_release_dev, .read = mbochs_read, .write = mbochs_write, .ioctl = mbochs_ioctl, From a5ddd2a99a7a393ceb023b83d7e78fbb3284bcfd Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:54 +0800 Subject: [PATCH 35/77] drm/i915/gvt: Use the new device life cycle helpers Move vfio_device to the start of intel_vgpu as required by the new helpers. Change intel_gvt_create_vgpu() to use intel_vgpu as the first param as other vgpu helpers do. Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Zhenyu Wang Link: https://lore.kernel.org/r/20220921104401.38898-9-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/gvt.h | 5 ++- drivers/gpu/drm/i915/gvt/kvmgt.c | 52 ++++++++++++++++++++++---------- drivers/gpu/drm/i915/gvt/vgpu.c | 33 ++++++++------------ 3 files changed, 50 insertions(+), 40 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 705689e64011..89fab7896fc6 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -172,6 +172,7 @@ struct intel_vgpu_submission { #define KVMGT_DEBUGFS_FILENAME "kvmgt_nr_cache_entries" struct intel_vgpu { + struct vfio_device vfio_device; struct intel_gvt *gvt; struct mutex vgpu_lock; int id; @@ -211,7 +212,6 @@ struct intel_vgpu { u32 scan_nonprivbb; - struct vfio_device vfio_device; struct vfio_region *region; int num_regions; struct eventfd_ctx *intx_trigger; @@ -494,8 +494,7 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt); struct intel_vgpu *intel_gvt_create_idle_vgpu(struct intel_gvt *gvt); void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu); -struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type); +int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, struct intel_vgpu_type *type); void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu); void intel_gvt_release_vgpu(struct intel_vgpu *vgpu); void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index e3cd58946477..41bba40feef8 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1546,7 +1546,33 @@ static const struct attribute_group *intel_vgpu_groups[] = { NULL, }; +static int intel_vgpu_init_dev(struct vfio_device *vfio_dev) +{ + struct mdev_device *mdev = to_mdev_device(vfio_dev->dev); + struct device *pdev = mdev_parent_dev(mdev); + struct intel_gvt *gvt = kdev_to_i915(pdev)->gvt; + struct intel_vgpu_type *type; + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + + type = &gvt->types[mdev_get_type_group_id(mdev)]; + if (!type) + return -EINVAL; + + vgpu->gvt = gvt; + return intel_gvt_create_vgpu(vgpu, type); +} + +static void intel_vgpu_release_dev(struct vfio_device *vfio_dev) +{ + struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + + intel_gvt_destroy_vgpu(vgpu); + vfio_free_device(vfio_dev); +} + static const struct vfio_device_ops intel_vgpu_dev_ops = { + .init = intel_vgpu_init_dev, + .release = intel_vgpu_release_dev, .open_device = intel_vgpu_open_device, .close_device = intel_vgpu_close_device, .read = intel_vgpu_read, @@ -1558,35 +1584,28 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { static int intel_vgpu_probe(struct mdev_device *mdev) { - struct device *pdev = mdev_parent_dev(mdev); - struct intel_gvt *gvt = kdev_to_i915(pdev)->gvt; - struct intel_vgpu_type *type; struct intel_vgpu *vgpu; int ret; - type = &gvt->types[mdev_get_type_group_id(mdev)]; - if (!type) - return -EINVAL; - - vgpu = intel_gvt_create_vgpu(gvt, type); + vgpu = vfio_alloc_device(intel_vgpu, vfio_device, &mdev->dev, + &intel_vgpu_dev_ops); if (IS_ERR(vgpu)) { gvt_err("failed to create intel vgpu: %ld\n", PTR_ERR(vgpu)); return PTR_ERR(vgpu); } - vfio_init_group_dev(&vgpu->vfio_device, &mdev->dev, - &intel_vgpu_dev_ops); - dev_set_drvdata(&mdev->dev, vgpu); ret = vfio_register_emulated_iommu_dev(&vgpu->vfio_device); - if (ret) { - intel_gvt_destroy_vgpu(vgpu); - return ret; - } + if (ret) + goto out_put_vdev; gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n", dev_name(mdev_dev(mdev))); return 0; + +out_put_vdev: + vfio_put_device(&vgpu->vfio_device); + return ret; } static void intel_vgpu_remove(struct mdev_device *mdev) @@ -1595,7 +1614,8 @@ static void intel_vgpu_remove(struct mdev_device *mdev) if (WARN_ON_ONCE(vgpu->attached)) return; - intel_gvt_destroy_vgpu(vgpu); + + vfio_put_device(&vgpu->vfio_device); } static struct mdev_driver intel_vgpu_mdev_driver = { diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 46da19b3225d..5c533fbc2c8d 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -302,8 +302,6 @@ void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu) mutex_lock(&gvt->lock); intel_gvt_update_vgpu_types(gvt); mutex_unlock(&gvt->lock); - - vfree(vgpu); } #define IDLE_VGPU_IDR 0 @@ -363,28 +361,23 @@ void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu) vfree(vgpu); } -static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_creation_params *param) +static int __intel_gvt_create_vgpu(struct intel_vgpu *vgpu, + struct intel_vgpu_creation_params *param) { + struct intel_gvt *gvt = vgpu->gvt; struct drm_i915_private *dev_priv = gvt->gt->i915; - struct intel_vgpu *vgpu; int ret; gvt_dbg_core("low %llu MB high %llu MB fence %llu\n", param->low_gm_sz, param->high_gm_sz, param->fence_sz); - vgpu = vzalloc(sizeof(*vgpu)); - if (!vgpu) - return ERR_PTR(-ENOMEM); - ret = idr_alloc(&gvt->vgpu_idr, vgpu, IDLE_VGPU_IDR + 1, GVT_MAX_VGPU, GFP_KERNEL); if (ret < 0) - goto out_free_vgpu; + return ret; vgpu->id = ret; - vgpu->gvt = gvt; vgpu->sched_ctl.weight = param->weight; mutex_init(&vgpu->vgpu_lock); mutex_init(&vgpu->dmabuf_lock); @@ -437,7 +430,7 @@ static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt, if (ret) goto out_clean_sched_policy; - return vgpu; + return 0; out_clean_sched_policy: intel_vgpu_clean_sched_policy(vgpu); @@ -455,9 +448,7 @@ static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt, intel_vgpu_clean_mmio(vgpu); out_clean_idr: idr_remove(&gvt->vgpu_idr, vgpu->id); -out_free_vgpu: - vfree(vgpu); - return ERR_PTR(ret); + return ret; } /** @@ -470,11 +461,11 @@ static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt, * Returns: * pointer to intel_vgpu, error pointer if failed. */ -struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, - struct intel_vgpu_type *type) +int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, struct intel_vgpu_type *type) { + struct intel_gvt *gvt = vgpu->gvt; struct intel_vgpu_creation_params param; - struct intel_vgpu *vgpu; + int ret; param.primary = 1; param.low_gm_sz = type->low_gm_size; @@ -488,15 +479,15 @@ struct intel_vgpu *intel_gvt_create_vgpu(struct intel_gvt *gvt, param.high_gm_sz = BYTES_TO_MB(param.high_gm_sz); mutex_lock(&gvt->lock); - vgpu = __intel_gvt_create_vgpu(gvt, ¶m); - if (!IS_ERR(vgpu)) { + ret = __intel_gvt_create_vgpu(vgpu, ¶m); + if (!ret) { /* calculate left instance change for types */ intel_gvt_update_vgpu_types(gvt); intel_gvt_update_reg_whitelist(vgpu); } mutex_unlock(&gvt->lock); - return vgpu; + return ret; } /** From 7cb5a82eb162d268f65c7b0fbec4a5f6495bab79 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:55 +0800 Subject: [PATCH 36/77] vfio/ap: Use the new device life cycle helpers and manage available_instances inside @init/@release. Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Tony Krowiak Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-10-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/s390/crypto/vfio_ap_ops.c | 50 ++++++++++++++++++------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 6c8c41fac4e1..161597357a64 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -684,42 +684,44 @@ static bool vfio_ap_mdev_filter_matrix(unsigned long *apm, unsigned long *aqm, AP_DOMAINS); } -static int vfio_ap_mdev_probe(struct mdev_device *mdev) +static int vfio_ap_mdev_init_dev(struct vfio_device *vdev) { - struct ap_matrix_mdev *matrix_mdev; - int ret; + struct ap_matrix_mdev *matrix_mdev = + container_of(vdev, struct ap_matrix_mdev, vdev); if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0)) return -EPERM; - matrix_mdev = kzalloc(sizeof(*matrix_mdev), GFP_KERNEL); - if (!matrix_mdev) { - ret = -ENOMEM; - goto err_dec_available; - } - vfio_init_group_dev(&matrix_mdev->vdev, &mdev->dev, - &vfio_ap_matrix_dev_ops); - - matrix_mdev->mdev = mdev; + matrix_mdev->mdev = to_mdev_device(vdev->dev); vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix); matrix_mdev->pqap_hook = handle_pqap; vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->shadow_apcb); hash_init(matrix_mdev->qtable.queues); + return 0; +} + +static int vfio_ap_mdev_probe(struct mdev_device *mdev) +{ + struct ap_matrix_mdev *matrix_mdev; + int ret; + + matrix_mdev = vfio_alloc_device(ap_matrix_mdev, vdev, &mdev->dev, + &vfio_ap_matrix_dev_ops); + if (IS_ERR(matrix_mdev)) + return PTR_ERR(matrix_mdev); + ret = vfio_register_emulated_iommu_dev(&matrix_mdev->vdev); if (ret) - goto err_list; + goto err_put_vdev; dev_set_drvdata(&mdev->dev, matrix_mdev); mutex_lock(&matrix_dev->mdevs_lock); list_add(&matrix_mdev->node, &matrix_dev->mdev_list); mutex_unlock(&matrix_dev->mdevs_lock); return 0; -err_list: - vfio_uninit_group_dev(&matrix_mdev->vdev); - kfree(matrix_mdev); -err_dec_available: - atomic_inc(&matrix_dev->available_instances); +err_put_vdev: + vfio_put_device(&matrix_mdev->vdev); return ret; } @@ -766,6 +768,12 @@ static void vfio_ap_mdev_unlink_fr_queues(struct ap_matrix_mdev *matrix_mdev) } } +static void vfio_ap_mdev_release_dev(struct vfio_device *vdev) +{ + atomic_inc(&matrix_dev->available_instances); + vfio_free_device(vdev); +} + static void vfio_ap_mdev_remove(struct mdev_device *mdev) { struct ap_matrix_mdev *matrix_mdev = dev_get_drvdata(&mdev->dev); @@ -779,9 +787,7 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) list_del(&matrix_mdev->node); mutex_unlock(&matrix_dev->mdevs_lock); mutex_unlock(&matrix_dev->guests_lock); - vfio_uninit_group_dev(&matrix_mdev->vdev); - kfree(matrix_mdev); - atomic_inc(&matrix_dev->available_instances); + vfio_put_device(&matrix_mdev->vdev); } static ssize_t name_show(struct mdev_type *mtype, @@ -1794,6 +1800,8 @@ static const struct attribute_group vfio_queue_attr_group = { }; static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { + .init = vfio_ap_mdev_init_dev, + .release = vfio_ap_mdev_release_dev, .open_device = vfio_ap_mdev_open_device, .close_device = vfio_ap_mdev_close_device, .ioctl = vfio_ap_mdev_ioctl, From 7566692c571dced7208b7cc26c1d3b898a233487 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:43:56 +0800 Subject: [PATCH 37/77] vfio/fsl-mc: Use the new device life cycle helpers Also add a comment to mark that vfio core releases device_set if @init fails. Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-11-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 93 ++++++++++++++++++------------- 1 file changed, 53 insertions(+), 40 deletions(-) diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 42b344bd7cd5..b16874e913e4 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -418,16 +418,7 @@ static int vfio_fsl_mc_mmap(struct vfio_device *core_vdev, return vfio_fsl_mc_mmap_mmio(vdev->regions[index], vma); } -static const struct vfio_device_ops vfio_fsl_mc_ops = { - .name = "vfio-fsl-mc", - .open_device = vfio_fsl_mc_open_device, - .close_device = vfio_fsl_mc_close_device, - .ioctl = vfio_fsl_mc_ioctl, - .read = vfio_fsl_mc_read, - .write = vfio_fsl_mc_write, - .mmap = vfio_fsl_mc_mmap, -}; - +static const struct vfio_device_ops vfio_fsl_mc_ops; static int vfio_fsl_mc_bus_notifier(struct notifier_block *nb, unsigned long action, void *data) { @@ -518,35 +509,43 @@ static void vfio_fsl_uninit_device(struct vfio_fsl_mc_device *vdev) bus_unregister_notifier(&fsl_mc_bus_type, &vdev->nb); } +static int vfio_fsl_mc_init_dev(struct vfio_device *core_vdev) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + struct fsl_mc_device *mc_dev = to_fsl_mc_device(core_vdev->dev); + int ret; + + vdev->mc_dev = mc_dev; + mutex_init(&vdev->igate); + + if (is_fsl_mc_bus_dprc(mc_dev)) + ret = vfio_assign_device_set(core_vdev, &mc_dev->dev); + else + ret = vfio_assign_device_set(core_vdev, mc_dev->dev.parent); + + if (ret) + return ret; + + /* device_set is released by vfio core if @init fails */ + return vfio_fsl_mc_init_device(vdev); +} + static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) { struct vfio_fsl_mc_device *vdev; struct device *dev = &mc_dev->dev; int ret; - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; - - vfio_init_group_dev(&vdev->vdev, dev, &vfio_fsl_mc_ops); - vdev->mc_dev = mc_dev; - mutex_init(&vdev->igate); - - if (is_fsl_mc_bus_dprc(mc_dev)) - ret = vfio_assign_device_set(&vdev->vdev, &mc_dev->dev); - else - ret = vfio_assign_device_set(&vdev->vdev, mc_dev->dev.parent); - if (ret) - goto out_uninit; - - ret = vfio_fsl_mc_init_device(vdev); - if (ret) - goto out_uninit; + vdev = vfio_alloc_device(vfio_fsl_mc_device, vdev, dev, + &vfio_fsl_mc_ops); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); ret = vfio_register_group_dev(&vdev->vdev); if (ret) { dev_err(dev, "VFIO_FSL_MC: Failed to add to vfio group\n"); - goto out_device; + goto out_put_vdev; } ret = vfio_fsl_mc_scan_container(mc_dev); @@ -557,30 +556,44 @@ static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) out_group_dev: vfio_unregister_group_dev(&vdev->vdev); -out_device: - vfio_fsl_uninit_device(vdev); -out_uninit: - vfio_uninit_group_dev(&vdev->vdev); - kfree(vdev); +out_put_vdev: + vfio_put_device(&vdev->vdev); return ret; } +static void vfio_fsl_mc_release_dev(struct vfio_device *core_vdev) +{ + struct vfio_fsl_mc_device *vdev = + container_of(core_vdev, struct vfio_fsl_mc_device, vdev); + + vfio_fsl_uninit_device(vdev); + mutex_destroy(&vdev->igate); + vfio_free_device(core_vdev); +} + static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) { struct device *dev = &mc_dev->dev; struct vfio_fsl_mc_device *vdev = dev_get_drvdata(dev); vfio_unregister_group_dev(&vdev->vdev); - mutex_destroy(&vdev->igate); - dprc_remove_devices(mc_dev, NULL, 0); - vfio_fsl_uninit_device(vdev); - - vfio_uninit_group_dev(&vdev->vdev); - kfree(vdev); + vfio_put_device(&vdev->vdev); return 0; } +static const struct vfio_device_ops vfio_fsl_mc_ops = { + .name = "vfio-fsl-mc", + .init = vfio_fsl_mc_init_dev, + .release = vfio_fsl_mc_release_dev, + .open_device = vfio_fsl_mc_open_device, + .close_device = vfio_fsl_mc_close_device, + .ioctl = vfio_fsl_mc_ioctl, + .read = vfio_fsl_mc_read, + .write = vfio_fsl_mc_write, + .mmap = vfio_fsl_mc_mmap, +}; + static struct fsl_mc_driver vfio_fsl_mc_driver = { .probe = vfio_fsl_mc_probe, .remove = vfio_fsl_mc_remove, From 5f6c7e0831a1f1faffad43bb8dbc260b49f2d3dc Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:57 +0800 Subject: [PATCH 38/77] vfio/platform: Use the new device life cycle helpers Move vfio_device_ops from platform core to platform drivers so device specific init/cleanup can be added. Introduce two new helpers vfio_platform_init/release_common() for the use in driver @init/@release. vfio_platform_probe/remove_common() will be deprecated. Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Tested-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-12-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform.c | 66 +++++++++++++++---- drivers/vfio/platform/vfio_platform_common.c | 53 ++++++++++++--- drivers/vfio/platform/vfio_platform_private.h | 15 +++++ 3 files changed, 111 insertions(+), 23 deletions(-) diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 04f40c5acfd6..82cedcebfd90 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "vfio_platform_private.h" @@ -36,14 +37,11 @@ static int get_platform_irq(struct vfio_platform_device *vdev, int i) return platform_get_irq_optional(pdev, i); } -static int vfio_platform_probe(struct platform_device *pdev) +static int vfio_platform_init_dev(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev; - int ret; - - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + struct platform_device *pdev = to_platform_device(core_vdev->dev); vdev->opaque = (void *) pdev; vdev->name = pdev->name; @@ -52,24 +50,64 @@ static int vfio_platform_probe(struct platform_device *pdev) vdev->get_irq = get_platform_irq; vdev->reset_required = reset_required; - ret = vfio_platform_probe_common(vdev, &pdev->dev); - if (ret) { - kfree(vdev); - return ret; - } + return vfio_platform_init_common(vdev); +} + +static const struct vfio_device_ops vfio_platform_ops; +static int vfio_platform_probe(struct platform_device *pdev) +{ + struct vfio_platform_device *vdev; + int ret; + + vdev = vfio_alloc_device(vfio_platform_device, vdev, &pdev->dev, + &vfio_platform_ops); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) + goto out_put_vdev; + + pm_runtime_enable(&pdev->dev); dev_set_drvdata(&pdev->dev, vdev); return 0; + +out_put_vdev: + vfio_put_device(&vdev->vdev); + return ret; +} + +static void vfio_platform_release_dev(struct vfio_device *core_vdev) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + + vfio_platform_release_common(vdev); + vfio_free_device(core_vdev); } static int vfio_platform_remove(struct platform_device *pdev) { struct vfio_platform_device *vdev = dev_get_drvdata(&pdev->dev); - vfio_platform_remove_common(vdev); - kfree(vdev); + vfio_unregister_group_dev(&vdev->vdev); + pm_runtime_disable(vdev->device); + vfio_put_device(&vdev->vdev); return 0; } +static const struct vfio_device_ops vfio_platform_ops = { + .name = "vfio-platform", + .init = vfio_platform_init_dev, + .release = vfio_platform_release_dev, + .open_device = vfio_platform_open_device, + .close_device = vfio_platform_close_device, + .ioctl = vfio_platform_ioctl, + .read = vfio_platform_read, + .write = vfio_platform_write, + .mmap = vfio_platform_mmap, +}; + static struct platform_driver vfio_platform_driver = { .probe = vfio_platform_probe, .remove = vfio_platform_remove, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 256f55b84e70..4c01bf0adebb 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -218,7 +218,7 @@ static int vfio_platform_call_reset(struct vfio_platform_device *vdev, return -EINVAL; } -static void vfio_platform_close_device(struct vfio_device *core_vdev) +void vfio_platform_close_device(struct vfio_device *core_vdev) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -236,8 +236,9 @@ static void vfio_platform_close_device(struct vfio_device *core_vdev) vfio_platform_regions_cleanup(vdev); vfio_platform_irq_cleanup(vdev); } +EXPORT_SYMBOL_GPL(vfio_platform_close_device); -static int vfio_platform_open_device(struct vfio_device *core_vdev) +int vfio_platform_open_device(struct vfio_device *core_vdev) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -273,9 +274,10 @@ static int vfio_platform_open_device(struct vfio_device *core_vdev) vfio_platform_regions_cleanup(vdev); return ret; } +EXPORT_SYMBOL_GPL(vfio_platform_open_device); -static long vfio_platform_ioctl(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) +long vfio_platform_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -382,6 +384,7 @@ static long vfio_platform_ioctl(struct vfio_device *core_vdev, return -ENOTTY; } +EXPORT_SYMBOL_GPL(vfio_platform_ioctl); static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, char __user *buf, size_t count, @@ -438,8 +441,8 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, return -EFAULT; } -static ssize_t vfio_platform_read(struct vfio_device *core_vdev, - char __user *buf, size_t count, loff_t *ppos) +ssize_t vfio_platform_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -460,6 +463,7 @@ static ssize_t vfio_platform_read(struct vfio_device *core_vdev, return -EINVAL; } +EXPORT_SYMBOL_GPL(vfio_platform_read); static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, const char __user *buf, size_t count, @@ -515,8 +519,8 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, return -EFAULT; } -static ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __user *buf, - size_t count, loff_t *ppos) +ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -537,6 +541,7 @@ static ssize_t vfio_platform_write(struct vfio_device *core_vdev, const char __u return -EINVAL; } +EXPORT_SYMBOL_GPL(vfio_platform_write); static int vfio_platform_mmap_mmio(struct vfio_platform_region region, struct vm_area_struct *vma) @@ -558,7 +563,7 @@ static int vfio_platform_mmap_mmio(struct vfio_platform_region region, req_len, vma->vm_page_prot); } -static int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) +int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { struct vfio_platform_device *vdev = container_of(core_vdev, struct vfio_platform_device, vdev); @@ -598,6 +603,7 @@ static int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_stru return -EINVAL; } +EXPORT_SYMBOL_GPL(vfio_platform_mmap); static const struct vfio_device_ops vfio_platform_ops = { .name = "vfio-platform", @@ -639,6 +645,35 @@ static int vfio_platform_of_probe(struct vfio_platform_device *vdev, * If the firmware is ACPI type, then acpi_disabled is 0. All other checks are * valid checks. We cannot claim that this system is DT. */ +int vfio_platform_init_common(struct vfio_platform_device *vdev) +{ + int ret; + struct device *dev = vdev->vdev.dev; + + ret = vfio_platform_acpi_probe(vdev, dev); + if (ret) + ret = vfio_platform_of_probe(vdev, dev); + + if (ret) + return ret; + + vdev->device = dev; + mutex_init(&vdev->igate); + + ret = vfio_platform_get_reset(vdev); + if (ret && vdev->reset_required) + dev_err(dev, "No reset function found for device %s\n", + vdev->name); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_platform_init_common); + +void vfio_platform_release_common(struct vfio_platform_device *vdev) +{ + vfio_platform_put_reset(vdev); +} +EXPORT_SYMBOL_GPL(vfio_platform_release_common); + int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct device *dev) { diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index 691b43f4b2b2..a769d649fb97 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -81,6 +81,21 @@ struct vfio_platform_reset_node { int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct device *dev); void vfio_platform_remove_common(struct vfio_platform_device *vdev); +int vfio_platform_init_common(struct vfio_platform_device *vdev); +void vfio_platform_release_common(struct vfio_platform_device *vdev); + +int vfio_platform_open_device(struct vfio_device *core_vdev); +void vfio_platform_close_device(struct vfio_device *core_vdev); +long vfio_platform_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg); +ssize_t vfio_platform_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, + loff_t *ppos); +ssize_t vfio_platform_write(struct vfio_device *core_vdev, + const char __user *buf, + size_t count, loff_t *ppos); +int vfio_platform_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma); int vfio_platform_irq_init(struct vfio_platform_device *vdev); void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev); From ac1237912fbd0f2503344aa268ceb43628cdffa8 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:58 +0800 Subject: [PATCH 39/77] vfio/amba: Use the new device life cycle helpers Implement amba's own vfio_device_ops. Remove vfio_platform_probe/remove_common() given no user now. Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-13-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_amba.c | 72 ++++++++++++++----- drivers/vfio/platform/vfio_platform_common.c | 60 ---------------- drivers/vfio/platform/vfio_platform_private.h | 3 - 3 files changed, 55 insertions(+), 80 deletions(-) diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 1aaa4f721bd2..eaea63e5294c 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "vfio_platform_private.h" @@ -40,20 +41,16 @@ static int get_amba_irq(struct vfio_platform_device *vdev, int i) return ret ? ret : -ENXIO; } -static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) +static int vfio_amba_init_dev(struct vfio_device *core_vdev) { - struct vfio_platform_device *vdev; + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + struct amba_device *adev = to_amba_device(core_vdev->dev); int ret; - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) - return -ENOMEM; - vdev->name = kasprintf(GFP_KERNEL, "vfio-amba-%08x", adev->periphid); - if (!vdev->name) { - kfree(vdev); + if (!vdev->name) return -ENOMEM; - } vdev->opaque = (void *) adev; vdev->flags = VFIO_DEVICE_FLAGS_AMBA; @@ -61,26 +58,67 @@ static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) vdev->get_irq = get_amba_irq; vdev->reset_required = false; - ret = vfio_platform_probe_common(vdev, &adev->dev); - if (ret) { + ret = vfio_platform_init_common(vdev); + if (ret) kfree(vdev->name); - kfree(vdev); - return ret; - } + return ret; +} +static const struct vfio_device_ops vfio_amba_ops; +static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id) +{ + struct vfio_platform_device *vdev; + int ret; + + vdev = vfio_alloc_device(vfio_platform_device, vdev, &adev->dev, + &vfio_amba_ops); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) + goto out_put_vdev; + + pm_runtime_enable(&adev->dev); dev_set_drvdata(&adev->dev, vdev); return 0; + +out_put_vdev: + vfio_put_device(&vdev->vdev); + return ret; +} + +static void vfio_amba_release_dev(struct vfio_device *core_vdev) +{ + struct vfio_platform_device *vdev = + container_of(core_vdev, struct vfio_platform_device, vdev); + + vfio_platform_release_common(vdev); + kfree(vdev->name); + vfio_free_device(core_vdev); } static void vfio_amba_remove(struct amba_device *adev) { struct vfio_platform_device *vdev = dev_get_drvdata(&adev->dev); - vfio_platform_remove_common(vdev); - kfree(vdev->name); - kfree(vdev); + vfio_unregister_group_dev(&vdev->vdev); + pm_runtime_disable(vdev->device); + vfio_put_device(&vdev->vdev); } +static const struct vfio_device_ops vfio_amba_ops = { + .name = "vfio-amba", + .init = vfio_amba_init_dev, + .release = vfio_amba_release_dev, + .open_device = vfio_platform_open_device, + .close_device = vfio_platform_close_device, + .ioctl = vfio_platform_ioctl, + .read = vfio_platform_read, + .write = vfio_platform_write, + .mmap = vfio_platform_mmap, +}; + static const struct amba_id pl330_ids[] = { { 0, 0 }, }; diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 4c01bf0adebb..55dc4f43c31e 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -605,16 +605,6 @@ int vfio_platform_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma } EXPORT_SYMBOL_GPL(vfio_platform_mmap); -static const struct vfio_device_ops vfio_platform_ops = { - .name = "vfio-platform", - .open_device = vfio_platform_open_device, - .close_device = vfio_platform_close_device, - .ioctl = vfio_platform_ioctl, - .read = vfio_platform_read, - .write = vfio_platform_write, - .mmap = vfio_platform_mmap, -}; - static int vfio_platform_of_probe(struct vfio_platform_device *vdev, struct device *dev) { @@ -674,56 +664,6 @@ void vfio_platform_release_common(struct vfio_platform_device *vdev) } EXPORT_SYMBOL_GPL(vfio_platform_release_common); -int vfio_platform_probe_common(struct vfio_platform_device *vdev, - struct device *dev) -{ - int ret; - - vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops); - - ret = vfio_platform_acpi_probe(vdev, dev); - if (ret) - ret = vfio_platform_of_probe(vdev, dev); - - if (ret) - goto out_uninit; - - vdev->device = dev; - - ret = vfio_platform_get_reset(vdev); - if (ret && vdev->reset_required) { - dev_err(dev, "No reset function found for device %s\n", - vdev->name); - goto out_uninit; - } - - ret = vfio_register_group_dev(&vdev->vdev); - if (ret) - goto put_reset; - - mutex_init(&vdev->igate); - - pm_runtime_enable(dev); - return 0; - -put_reset: - vfio_platform_put_reset(vdev); -out_uninit: - vfio_uninit_group_dev(&vdev->vdev); - return ret; -} -EXPORT_SYMBOL_GPL(vfio_platform_probe_common); - -void vfio_platform_remove_common(struct vfio_platform_device *vdev) -{ - vfio_unregister_group_dev(&vdev->vdev); - - pm_runtime_disable(vdev->device); - vfio_platform_put_reset(vdev); - vfio_uninit_group_dev(&vdev->vdev); -} -EXPORT_SYMBOL_GPL(vfio_platform_remove_common); - void __vfio_platform_register_reset(struct vfio_platform_reset_node *node) { mutex_lock(&driver_lock); diff --git a/drivers/vfio/platform/vfio_platform_private.h b/drivers/vfio/platform/vfio_platform_private.h index a769d649fb97..8d8fab516849 100644 --- a/drivers/vfio/platform/vfio_platform_private.h +++ b/drivers/vfio/platform/vfio_platform_private.h @@ -78,9 +78,6 @@ struct vfio_platform_reset_node { vfio_platform_reset_fn_t of_reset; }; -int vfio_platform_probe_common(struct vfio_platform_device *vdev, - struct device *dev); -void vfio_platform_remove_common(struct vfio_platform_device *vdev); int vfio_platform_init_common(struct vfio_platform_device *vdev); void vfio_platform_release_common(struct vfio_platform_device *vdev); From ebb72b765fb49685c4603d2bff47a4ab5d2580a9 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:43:59 +0800 Subject: [PATCH 40/77] vfio/ccw: Use the new device life cycle helpers ccw is the only exception which cannot use vfio_alloc_device() because its private device structure is designed to serve both mdev and parent. Life cycle of the parent is managed by css_driver so vfio_ccw_private must be allocated/freed in css_driver probe/remove path instead of conforming to vfio core life cycle for mdev. Given that use a wait/completion scheme so the mdev remove path waits after vfio_put_device() until receiving a completion notification from @release. The completion indicates that all active references on vfio_device have been released. After that point although free of vfio_ccw_private is delayed to css_driver it's at least guaranteed to have no parallel reference on released vfio device part from other code paths. memset() in @probe is removed. vfio_device is either already cleared when probed for the first time or cleared in @release from last probe. The right fix is to introduce separate structures for mdev and parent, but this won't happen in short term per prior discussions. Remove vfio_init/uninit_group_dev() as no user now. Suggested-by: Jason Gunthorpe Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220921104401.38898-14-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_ops.c | 52 +++++++++++++++++++++++++---- drivers/s390/cio/vfio_ccw_private.h | 3 ++ drivers/vfio/vfio_main.c | 23 +++---------- include/linux/vfio.h | 3 -- 4 files changed, 53 insertions(+), 28 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 4a806a2273b5..9f8486c0d3d3 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -87,6 +87,15 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; +static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) +{ + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); + + init_completion(&private->release_comp); + return 0; +} + static int vfio_ccw_mdev_probe(struct mdev_device *mdev) { struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); @@ -98,9 +107,9 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) if (atomic_dec_if_positive(&private->avail) < 0) return -EPERM; - memset(&private->vdev, 0, sizeof(private->vdev)); - vfio_init_group_dev(&private->vdev, &mdev->dev, - &vfio_ccw_dev_ops); + ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); + if (ret) + return ret; VFIO_CCW_MSG_EVENT(2, "sch %x.%x.%04x: create\n", private->sch->schid.cssid, @@ -109,16 +118,33 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) ret = vfio_register_emulated_iommu_dev(&private->vdev); if (ret) - goto err_atomic; + goto err_put_vdev; dev_set_drvdata(&mdev->dev, private); return 0; -err_atomic: - vfio_uninit_group_dev(&private->vdev); +err_put_vdev: + vfio_put_device(&private->vdev); atomic_inc(&private->avail); return ret; } +static void vfio_ccw_mdev_release_dev(struct vfio_device *vdev) +{ + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); + + /* + * We cannot free vfio_ccw_private here because it includes + * parent info which must be free'ed by css driver. + * + * Use a workaround by memset'ing the core device part and + * then notifying the remove path that all active references + * to this device have been released. + */ + memset(vdev, 0, sizeof(*vdev)); + complete(&private->release_comp); +} + static void vfio_ccw_mdev_remove(struct mdev_device *mdev) { struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); @@ -130,7 +156,17 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) vfio_unregister_group_dev(&private->vdev); - vfio_uninit_group_dev(&private->vdev); + vfio_put_device(&private->vdev); + /* + * Wait for all active references on mdev are released so it + * is safe to defer kfree() to a later point. + * + * TODO: the clean fix is to split parent/mdev info from ccw + * private structure so each can be managed in its own life + * cycle. + */ + wait_for_completion(&private->release_comp); + atomic_inc(&private->avail); } @@ -592,6 +628,8 @@ static void vfio_ccw_mdev_request(struct vfio_device *vdev, unsigned int count) } static const struct vfio_device_ops vfio_ccw_dev_ops = { + .init = vfio_ccw_mdev_init_dev, + .release = vfio_ccw_mdev_release_dev, .open_device = vfio_ccw_mdev_open_device, .close_device = vfio_ccw_mdev_close_device, .read = vfio_ccw_mdev_read, diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index cd24b7fada91..63d9202b29c7 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -88,6 +88,7 @@ struct vfio_ccw_crw { * @req_trigger: eventfd ctx for signaling userspace to return device * @io_work: work for deferral process of I/O handling * @crw_work: work for deferral process of CRW handling + * @release_comp: synchronization helper for vfio device release */ struct vfio_ccw_private { struct vfio_device vdev; @@ -113,6 +114,8 @@ struct vfio_ccw_private { struct eventfd_ctx *req_trigger; struct work_struct io_work; struct work_struct crw_work; + + struct completion release_comp; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index b9c6a97d647a..12952858d903 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -483,28 +483,13 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, /* * VFIO driver API */ -void vfio_init_group_dev(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops) -{ - init_completion(&device->comp); - device->dev = dev; - device->ops = ops; -} -EXPORT_SYMBOL_GPL(vfio_init_group_dev); - -void vfio_uninit_group_dev(struct vfio_device *device) -{ - vfio_release_device_set(device); -} -EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); - /* Release helper called by vfio_put_device() */ void vfio_device_release(struct kref *kref) { struct vfio_device *device = container_of(kref, struct vfio_device, kref); - vfio_uninit_group_dev(device); + vfio_release_device_set(device); /* * kvfree() cannot be done here due to a life cycle mess in @@ -562,7 +547,9 @@ int vfio_init_device(struct vfio_device *device, struct device *dev, { int ret; - vfio_init_group_dev(device, dev, ops); + init_completion(&device->comp); + device->dev = dev; + device->ops = ops; if (ops->init) { ret = ops->init(device); @@ -574,7 +561,7 @@ int vfio_init_device(struct vfio_device *device, struct device *dev, return 0; out_uninit: - vfio_uninit_group_dev(device); + vfio_release_device_set(device); return ret; } EXPORT_SYMBOL_GPL(vfio_init_device); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f67cac700e6f..3cf857b1eec7 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -184,9 +184,6 @@ static inline void vfio_put_device(struct vfio_device *device) kref_put(&device->kref, vfio_device_release); } -void vfio_init_group_dev(struct vfio_device *device, struct device *dev, - const struct vfio_device_ops *ops); -void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); From 4a725b8de4cc5e88c00f7607d9ba0e97151251e5 Mon Sep 17 00:00:00 2001 From: Kevin Tian Date: Wed, 21 Sep 2022 18:44:00 +0800 Subject: [PATCH 41/77] vfio: Rename vfio_device_put() and vfio_device_try_get() With the addition of vfio_put_device() now the names become confusing. vfio_put_device() is clear from object life cycle p.o.v given kref. vfio_device_put()/vfio_device_try_get() are helpers for tracking users on a registered device. Now rename them: - vfio_device_put() -> vfio_device_put_registration() - vfio_device_try_get() -> vfio_device_try_get_registration() Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20220921104401.38898-15-kevin.tian@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 12952858d903..c27449613a1d 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -453,13 +453,13 @@ static void vfio_group_get(struct vfio_group *group) * Device objects - create, release, get, put, search */ /* Device reference always implies a group reference */ -static void vfio_device_put(struct vfio_device *device) +static void vfio_device_put_registration(struct vfio_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } -static bool vfio_device_try_get(struct vfio_device *device) +static bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } @@ -471,7 +471,8 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, mutex_lock(&group->device_lock); list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev && vfio_device_try_get(device)) { + if (device->dev == dev && + vfio_device_try_get_registration(device)) { mutex_unlock(&group->device_lock); return device; } @@ -673,7 +674,7 @@ static int __vfio_register_dev(struct vfio_device *device, if (existing_device) { dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(group->iommu_group)); - vfio_device_put(existing_device); + vfio_device_put_registration(existing_device); if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); @@ -731,7 +732,7 @@ static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, ret = !strcmp(dev_name(it->dev), buf); } - if (ret && vfio_device_try_get(it)) { + if (ret && vfio_device_try_get_registration(it)) { device = it; break; } @@ -751,7 +752,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) bool interrupted = false; long rc; - vfio_device_put(device); + vfio_device_put_registration(device); rc = try_wait_for_completion(&device->comp); while (rc <= 0) { if (device->ops->request) @@ -1311,7 +1312,7 @@ static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, err_put_fdno: put_unused_fd(fdno); err_put_device: - vfio_device_put(device); + vfio_device_put_registration(device); return ret; } @@ -1493,7 +1494,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) vfio_device_unassign_container(device); - vfio_device_put(device); + vfio_device_put_registration(device); return 0; } From 3c28a76124b25882411f005924be73795b6ef078 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 21 Sep 2022 18:44:01 +0800 Subject: [PATCH 42/77] vfio: Add struct device to vfio_device and replace kref. With it a 'vfio-dev/vfioX' node is created under the sysfs path of the parent, indicating the device is bound to a vfio driver, e.g.: /sys/devices/pci0000\:6f/0000\:6f\:01.0/vfio-dev/vfio0 It is also a preparatory step toward adding cdev for supporting future device-oriented uAPI. Add Documentation/ABI/testing/sysfs-devices-vfio-dev. Suggested-by: Jason Gunthorpe Signed-off-by: Yi Liu Signed-off-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20220921104401.38898-16-kevin.tian@intel.com Signed-off-by: Alex Williamson --- .../ABI/testing/sysfs-devices-vfio-dev | 8 +++ MAINTAINERS | 1 + drivers/vfio/vfio_main.c | 64 +++++++++++++++---- include/linux/vfio.h | 6 +- 4 files changed, 65 insertions(+), 14 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-devices-vfio-dev diff --git a/Documentation/ABI/testing/sysfs-devices-vfio-dev b/Documentation/ABI/testing/sysfs-devices-vfio-dev new file mode 100644 index 000000000000..e21424fd9666 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-vfio-dev @@ -0,0 +1,8 @@ +What: /sys/...//vfio-dev/vfioX/ +Date: September 2022 +Contact: Yi Liu +Description: + This directory is created when the device is bound to a + vfio driver. The layout under this directory matches what + exists for a standard 'struct device'. 'X' is a unique + index marking this device in vfio. diff --git a/MAINTAINERS b/MAINTAINERS index d30f26e07cd3..02c8f11b1c17 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21312,6 +21312,7 @@ R: Cornelia Huck L: kvm@vger.kernel.org S: Maintained T: git git://github.com/awilliam/linux-vfio.git +F: Documentation/ABI/testing/sysfs-devices-vfio-dev F: Documentation/driver-api/vfio.rst F: drivers/vfio/ F: include/linux/vfio.h diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index c27449613a1d..f9d10dbcf3e6 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -49,6 +49,8 @@ static struct vfio { struct mutex group_lock; /* locks group_list */ struct ida group_ida; dev_t group_devt; + struct class *device_class; + struct ida device_ida; } vfio; struct vfio_iommu_driver { @@ -485,12 +487,13 @@ static struct vfio_device *vfio_group_get_device(struct vfio_group *group, * VFIO driver API */ /* Release helper called by vfio_put_device() */ -void vfio_device_release(struct kref *kref) +static void vfio_device_release(struct device *dev) { struct vfio_device *device = - container_of(kref, struct vfio_device, kref); + container_of(dev, struct vfio_device, device); vfio_release_device_set(device); + ida_free(&vfio.device_ida, device->index); /* * kvfree() cannot be done here due to a life cycle mess in @@ -500,7 +503,6 @@ void vfio_device_release(struct kref *kref) */ device->ops->release(device); } -EXPORT_SYMBOL_GPL(vfio_device_release); /* * Allocate and initialize vfio_device so it can be registered to vfio @@ -548,6 +550,13 @@ int vfio_init_device(struct vfio_device *device, struct device *dev, { int ret; + ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL); + if (ret < 0) { + dev_dbg(dev, "Error to alloc index\n"); + return ret; + } + + device->index = ret; init_completion(&device->comp); device->dev = dev; device->ops = ops; @@ -558,11 +567,15 @@ int vfio_init_device(struct vfio_device *device, struct device *dev, goto out_uninit; } - kref_init(&device->kref); + device_initialize(&device->device); + device->device.release = vfio_device_release; + device->device.class = vfio.device_class; + device->device.parent = device->dev; return 0; out_uninit: vfio_release_device_set(device); + ida_free(&vfio.device_ida, device->index); return ret; } EXPORT_SYMBOL_GPL(vfio_init_device); @@ -659,6 +672,7 @@ static int __vfio_register_dev(struct vfio_device *device, struct vfio_group *group) { struct vfio_device *existing_device; + int ret; if (IS_ERR(group)) return PTR_ERR(group); @@ -675,16 +689,21 @@ static int __vfio_register_dev(struct vfio_device *device, dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(group->iommu_group)); vfio_device_put_registration(existing_device); - if (group->type == VFIO_NO_IOMMU || - group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - vfio_group_put(group); - return -EBUSY; + ret = -EBUSY; + goto err_out; } /* Our reference on group is moved to the device */ device->group = group; + ret = dev_set_name(&device->device, "vfio%d", device->index); + if (ret) + goto err_out; + + ret = device_add(&device->device); + if (ret) + goto err_out; + /* Refcounting can't start until the driver calls register */ refcount_set(&device->refcount, 1); @@ -693,6 +712,12 @@ static int __vfio_register_dev(struct vfio_device *device, mutex_unlock(&group->device_lock); return 0; +err_out: + if (group->type == VFIO_NO_IOMMU || + group->type == VFIO_EMULATED_IOMMU) + iommu_group_remove_device(device->dev); + vfio_group_put(group); + return ret; } int vfio_register_group_dev(struct vfio_device *device) @@ -779,6 +804,9 @@ void vfio_unregister_group_dev(struct vfio_device *device) list_del(&device->group_next); mutex_unlock(&group->device_lock); + /* Balances device_add in register path */ + device_del(&device->device); + if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); @@ -2362,6 +2390,7 @@ static int __init vfio_init(void) int ret; ida_init(&vfio.group_ida); + ida_init(&vfio.device_ida); mutex_init(&vfio.group_lock); mutex_init(&vfio.iommu_drivers_lock); INIT_LIST_HEAD(&vfio.group_list); @@ -2377,11 +2406,18 @@ static int __init vfio_init(void) vfio.class = class_create(THIS_MODULE, "vfio"); if (IS_ERR(vfio.class)) { ret = PTR_ERR(vfio.class); - goto err_class; + goto err_group_class; } vfio.class->devnode = vfio_devnode; + /* /sys/class/vfio-dev/vfioX */ + vfio.device_class = class_create(THIS_MODULE, "vfio-dev"); + if (IS_ERR(vfio.device_class)) { + ret = PTR_ERR(vfio.device_class); + goto err_dev_class; + } + ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); if (ret) goto err_alloc_chrdev; @@ -2398,9 +2434,12 @@ static int __init vfio_init(void) err_driver_register: unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); err_alloc_chrdev: + class_destroy(vfio.device_class); + vfio.device_class = NULL; +err_dev_class: class_destroy(vfio.class); vfio.class = NULL; -err_class: +err_group_class: misc_deregister(&vfio_dev); return ret; } @@ -2412,8 +2451,11 @@ static void __exit vfio_cleanup(void) #ifdef CONFIG_VFIO_NOIOMMU vfio_unregister_iommu_driver(&vfio_noiommu_ops); #endif + ida_destroy(&vfio.device_ida); ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); + class_destroy(vfio.device_class); + vfio.device_class = NULL; class_destroy(vfio.class); vfio.class = NULL; misc_deregister(&vfio_dev); diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 3cf857b1eec7..ee399a768070 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -47,7 +47,8 @@ struct vfio_device { struct kvm *kvm; /* Members below here are private, not for driver use */ - struct kref kref; /* object life cycle */ + unsigned int index; + struct device device; /* device.kref covers object life circle */ refcount_t refcount; /* user count on registered device*/ unsigned int open_count; struct completion comp; @@ -178,10 +179,9 @@ struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, int vfio_init_device(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_free_device(struct vfio_device *device); -void vfio_device_release(struct kref *kref); static inline void vfio_put_device(struct vfio_device *device) { - kref_put(&device->kref, vfio_device_release); + put_device(&device->device); } int vfio_register_group_dev(struct vfio_device *device); From e3bb4de0a0380910180e758a30ccfda65f8e286e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:19 -0300 Subject: [PATCH 43/77] vfio: Add header guards and includes to drivers/vfio/vfio.h As is normal for headers. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 503bea6c843d..093784f1dea7 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -3,6 +3,14 @@ * Copyright (C) 2012 Red Hat, Inc. All rights reserved. * Author: Alex Williamson */ +#ifndef __VFIO_VFIO_H__ +#define __VFIO_VFIO_H__ + +#include +#include +#include + +struct iommu_group; enum vfio_group_type { /* @@ -69,3 +77,5 @@ struct vfio_iommu_driver_ops { int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); + +#endif From 429a781c8e01c24ebb2b9da0a63a14e6fd9e0837 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:20 -0300 Subject: [PATCH 44/77] vfio: Rename __vfio_group_unset_container() To vfio_group_detach_container(). This function is really a container function. Fold the WARN_ON() into it as a precondition assertion. A following patch will move the vfio_container functions to their own .c file. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f9d10dbcf3e6..3d8813125358 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1036,12 +1036,13 @@ static const struct file_operations vfio_fops = { /* * VFIO Group fd, /dev/vfio/$GROUP */ -static void __vfio_group_unset_container(struct vfio_group *group) +static void vfio_group_detach_container(struct vfio_group *group) { struct vfio_container *container = group->container; struct vfio_iommu_driver *driver; lockdep_assert_held_write(&group->group_rwsem); + WARN_ON(group->container_users != 1); down_write(&container->group_lock); @@ -1089,7 +1090,7 @@ static int vfio_group_ioctl_unset_container(struct vfio_group *group) ret = -EBUSY; goto out_unlock; } - __vfio_group_unset_container(group); + vfio_group_detach_container(group); out_unlock: up_write(&group->group_rwsem); @@ -1441,10 +1442,8 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) * is only called when there are no open devices. */ WARN_ON(group->notifier.head); - if (group->container) { - WARN_ON(group->container_users != 1); - __vfio_group_unset_container(group); - } + if (group->container) + vfio_group_detach_container(group); group->opened_file = NULL; up_write(&group->group_rwsem); From 03e650f6611563c0ccbd0d769d5748fd10d8ee8e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:21 -0300 Subject: [PATCH 45/77] vfio: Split the container logic into vfio_container_attach_group() This splits up the ioctl of vfio_group_ioctl_set_container() so it determines the type of file then invokes a type specific attachment function. Future patches will add iommufd to this function as an alternative type. A following patch will move the vfio_container functions to their own .c file. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 78 ++++++++++++++++++++++++---------------- 1 file changed, 48 insertions(+), 30 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 3d8813125358..879c5d27c712 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1097,41 +1097,30 @@ static int vfio_group_ioctl_unset_container(struct vfio_group *group) return ret; } -static int vfio_group_ioctl_set_container(struct vfio_group *group, - int __user *arg) +static struct vfio_container *vfio_container_from_file(struct file *file) { - struct fd f; struct vfio_container *container; + + /* Sanity check, is this really our fd? */ + if (file->f_op != &vfio_fops) + return NULL; + + container = file->private_data; + WARN_ON(!container); /* fget ensures we don't race vfio_release */ + return container; +} + +static int vfio_container_attach_group(struct vfio_container *container, + struct vfio_group *group) +{ struct vfio_iommu_driver *driver; - int container_fd; int ret = 0; + lockdep_assert_held_write(&group->group_rwsem); + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; - if (get_user(container_fd, arg)) - return -EFAULT; - if (container_fd < 0) - return -EINVAL; - f = fdget(container_fd); - if (!f.file) - return -EBADF; - - /* Sanity check, is this really our fd? */ - if (f.file->f_op != &vfio_fops) { - ret = -EINVAL; - goto out_fdput; - } - container = f.file->private_data; - WARN_ON(!container); /* fget ensures we don't race vfio_release */ - - down_write(&group->group_rwsem); - - if (group->container || WARN_ON(group->container_users)) { - ret = -EINVAL; - goto out_unlock_group; - } - down_write(&container->group_lock); /* Real groups and fake groups cannot mix */ @@ -1142,7 +1131,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, } if (group->type == VFIO_IOMMU) { - ret = iommu_group_claim_dma_owner(group->iommu_group, f.file); + ret = iommu_group_claim_dma_owner(group->iommu_group, group); if (ret) goto out_unlock_container; } @@ -1170,9 +1159,38 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, out_unlock_container: up_write(&container->group_lock); -out_unlock_group: + return ret; +} + +static int vfio_group_ioctl_set_container(struct vfio_group *group, + int __user *arg) +{ + struct vfio_container *container; + struct fd f; + int ret; + int fd; + + if (get_user(fd, arg)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + down_write(&group->group_rwsem); + if (group->container || WARN_ON(group->container_users)) { + ret = -EINVAL; + goto out_unlock; + } + container = vfio_container_from_file(f.file); + ret = -EINVAL; + if (container) { + ret = vfio_container_attach_group(container, group); + goto out_unlock; + } + +out_unlock: up_write(&group->group_rwsem); -out_fdput: fdput(f); return ret; } From 444d43ecd01033a758e73b8ae154ee7f3e827f7b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:22 -0300 Subject: [PATCH 46/77] vfio: Remove #ifdefs around CONFIG_VFIO_NOIOMMU This can all be accomplished using typical IS_ENABLED techniques, drop it all. Also rename the variable to vfio_noiommu so this can be made global in following patches. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 43 ++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 879c5d27c712..f79e7eb02931 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -86,10 +86,12 @@ struct vfio_group { }; #ifdef CONFIG_VFIO_NOIOMMU -static bool noiommu __read_mostly; +static bool vfio_noiommu __read_mostly; module_param_named(enable_unsafe_noiommu_mode, - noiommu, bool, S_IRUGO | S_IWUSR); + vfio_noiommu, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); +#else +enum { vfio_noiommu = false }; #endif static DEFINE_XARRAY(vfio_device_set_xa); @@ -166,7 +168,6 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } -#ifdef CONFIG_VFIO_NOIOMMU static void *vfio_noiommu_open(unsigned long arg) { if (arg != VFIO_NOIOMMU_IOMMU) @@ -185,7 +186,7 @@ static long vfio_noiommu_ioctl(void *iommu_data, unsigned int cmd, unsigned long arg) { if (cmd == VFIO_CHECK_EXTENSION) - return noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; + return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; return -ENOTTY; } @@ -215,18 +216,13 @@ static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { * Only noiommu containers can use vfio-noiommu and noiommu containers can only * use vfio-noiommu. */ -static inline bool vfio_iommu_driver_allowed(struct vfio_container *container, - const struct vfio_iommu_driver *driver) +static bool vfio_iommu_driver_allowed(struct vfio_container *container, + const struct vfio_iommu_driver *driver) { + if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + return true; return container->noiommu == (driver->ops == &vfio_noiommu_ops); } -#else -static inline bool vfio_iommu_driver_allowed(struct vfio_container *container, - const struct vfio_iommu_driver *driver) -{ - return true; -} -#endif /* CONFIG_VFIO_NOIOMMU */ /* * IOMMU driver registration @@ -630,8 +626,7 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) struct vfio_group *group; iommu_group = iommu_group_get(dev); -#ifdef CONFIG_VFIO_NOIOMMU - if (!iommu_group && noiommu) { + if (!iommu_group && vfio_noiommu) { /* * With noiommu enabled, create an IOMMU group for devices that * don't already have one, implying no IOMMU hardware/driver @@ -645,7 +640,7 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) } return group; } -#endif + if (!iommu_group) return ERR_PTR(-EINVAL); @@ -2439,11 +2434,11 @@ static int __init vfio_init(void) if (ret) goto err_alloc_chrdev; -#ifdef CONFIG_VFIO_NOIOMMU - ret = vfio_register_iommu_driver(&vfio_noiommu_ops); -#endif - if (ret) - goto err_driver_register; + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { + ret = vfio_register_iommu_driver(&vfio_noiommu_ops); + if (ret) + goto err_driver_register; + } pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; @@ -2465,9 +2460,9 @@ static void __exit vfio_cleanup(void) { WARN_ON(!list_empty(&vfio.group_list)); -#ifdef CONFIG_VFIO_NOIOMMU - vfio_unregister_iommu_driver(&vfio_noiommu_ops); -#endif + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + vfio_unregister_iommu_driver(&vfio_noiommu_ops); + ida_destroy(&vfio.device_ida); ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); From c41da4622e08f874ab02e12eb6b6aaa9ac21daa7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:23 -0300 Subject: [PATCH 47/77] vfio: Split out container code from the init/cleanup functions This miscdev, noiommu driver and a couple of globals are all container items. Move this init into its own functions. A following patch will move the vfio_container functions to their own .c file. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 54 ++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f79e7eb02931..3cb52e9ab035 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -2397,15 +2397,11 @@ static struct miscdevice vfio_dev = { .mode = S_IRUGO | S_IWUGO, }; -static int __init vfio_init(void) +static int __init vfio_container_init(void) { int ret; - ida_init(&vfio.group_ida); - ida_init(&vfio.device_ida); - mutex_init(&vfio.group_lock); mutex_init(&vfio.iommu_drivers_lock); - INIT_LIST_HEAD(&vfio.group_list); INIT_LIST_HEAD(&vfio.iommu_drivers_list); ret = misc_register(&vfio_dev); @@ -2414,6 +2410,39 @@ static int __init vfio_init(void) return ret; } + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { + ret = vfio_register_iommu_driver(&vfio_noiommu_ops); + if (ret) + goto err_misc; + } + return 0; + +err_misc: + misc_deregister(&vfio_dev); + return ret; +} + +static void vfio_container_cleanup(void) +{ + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + vfio_unregister_iommu_driver(&vfio_noiommu_ops); + misc_deregister(&vfio_dev); + mutex_destroy(&vfio.iommu_drivers_lock); +} + +static int __init vfio_init(void) +{ + int ret; + + ida_init(&vfio.group_ida); + ida_init(&vfio.device_ida); + mutex_init(&vfio.group_lock); + INIT_LIST_HEAD(&vfio.group_list); + + ret = vfio_container_init(); + if (ret) + return ret; + /* /dev/vfio/$GROUP */ vfio.class = class_create(THIS_MODULE, "vfio"); if (IS_ERR(vfio.class)) { @@ -2434,17 +2463,9 @@ static int __init vfio_init(void) if (ret) goto err_alloc_chrdev; - if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { - ret = vfio_register_iommu_driver(&vfio_noiommu_ops); - if (ret) - goto err_driver_register; - } - pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; -err_driver_register: - unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); err_alloc_chrdev: class_destroy(vfio.device_class); vfio.device_class = NULL; @@ -2452,7 +2473,7 @@ static int __init vfio_init(void) class_destroy(vfio.class); vfio.class = NULL; err_group_class: - misc_deregister(&vfio_dev); + vfio_container_cleanup(); return ret; } @@ -2460,17 +2481,14 @@ static void __exit vfio_cleanup(void) { WARN_ON(!list_empty(&vfio.group_list)); - if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) - vfio_unregister_iommu_driver(&vfio_noiommu_ops); - ida_destroy(&vfio.device_ida); ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); class_destroy(vfio.device_class); vfio.device_class = NULL; class_destroy(vfio.class); + vfio_container_cleanup(); vfio.class = NULL; - misc_deregister(&vfio_dev); xa_destroy(&vfio_device_set_xa); } From 1408640d578887d7860737221043d91fc6d5a723 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:24 -0300 Subject: [PATCH 48/77] vfio: Rename vfio_ioctl_check_extension() To vfio_container_ioctl_check_extension(). A following patch will turn this into a non-static function, make it clear it is related to the container. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 3cb52e9ab035..33e55e40c416 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -813,8 +813,9 @@ EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); /* * VFIO base fd, /dev/vfio/vfio */ -static long vfio_ioctl_check_extension(struct vfio_container *container, - unsigned long arg) +static long +vfio_container_ioctl_check_extension(struct vfio_container *container, + unsigned long arg) { struct vfio_iommu_driver *driver; long ret = 0; @@ -971,7 +972,7 @@ static long vfio_fops_unl_ioctl(struct file *filep, ret = VFIO_API_VERSION; break; case VFIO_CHECK_EXTENSION: - ret = vfio_ioctl_check_extension(container, arg); + ret = vfio_container_ioctl_check_extension(container, arg); break; case VFIO_SET_IOMMU: ret = vfio_ioctl_set_iommu(container, arg); @@ -2100,8 +2101,8 @@ bool vfio_file_enforced_coherent(struct file *file) down_read(&group->group_rwsem); if (group->container) { - ret = vfio_ioctl_check_extension(group->container, - VFIO_DMA_CC_IOMMU); + ret = vfio_container_ioctl_check_extension(group->container, + VFIO_DMA_CC_IOMMU); } else { /* * Since the coherency state is determined only once a container From 9446162e740aefff95c324ac0887f0b68c739695 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:25 -0300 Subject: [PATCH 49/77] vfio: Split the register_device ops call into functions This is a container item. A following patch will move the vfio_container functions to their own .c file. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 33e55e40c416..1ac7160f9329 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1226,9 +1226,28 @@ static void vfio_device_unassign_container(struct vfio_device *device) up_write(&device->group->group_rwsem); } +static void vfio_device_container_register(struct vfio_device *device) +{ + struct vfio_iommu_driver *iommu_driver = + device->group->container->iommu_driver; + + if (iommu_driver && iommu_driver->ops->register_device) + iommu_driver->ops->register_device( + device->group->container->iommu_data, device); +} + +static void vfio_device_container_unregister(struct vfio_device *device) +{ + struct vfio_iommu_driver *iommu_driver = + device->group->container->iommu_driver; + + if (iommu_driver && iommu_driver->ops->unregister_device) + iommu_driver->ops->unregister_device( + device->group->container->iommu_data, device); +} + static struct file *vfio_device_open(struct vfio_device *device) { - struct vfio_iommu_driver *iommu_driver; struct file *filep; int ret; @@ -1259,12 +1278,7 @@ static struct file *vfio_device_open(struct vfio_device *device) if (ret) goto err_undo_count; } - - iommu_driver = device->group->container->iommu_driver; - if (iommu_driver && iommu_driver->ops->register_device) - iommu_driver->ops->register_device( - device->group->container->iommu_data, device); - + vfio_device_container_register(device); up_read(&device->group->group_rwsem); } mutex_unlock(&device->dev_set->lock); @@ -1302,10 +1316,7 @@ static struct file *vfio_device_open(struct vfio_device *device) if (device->open_count == 1 && device->ops->close_device) { device->ops->close_device(device); - iommu_driver = device->group->container->iommu_driver; - if (iommu_driver && iommu_driver->ops->unregister_device) - iommu_driver->ops->unregister_device( - device->group->container->iommu_data, device); + vfio_device_container_unregister(device); } err_undo_count: up_read(&device->group->group_rwsem); @@ -1513,7 +1524,6 @@ static inline void vfio_device_pm_runtime_put(struct vfio_device *device) static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - struct vfio_iommu_driver *iommu_driver; mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); @@ -1521,10 +1531,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); - iommu_driver = device->group->container->iommu_driver; - if (iommu_driver && iommu_driver->ops->unregister_device) - iommu_driver->ops->unregister_device( - device->group->container->iommu_data, device); + vfio_device_container_unregister(device); up_read(&device->group->group_rwsem); device->open_count--; if (device->open_count == 0) From cdc71fe4ecbf48f7292ae8b7e4ff4a2a8b5bdbca Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 16:20:26 -0300 Subject: [PATCH 50/77] vfio: Move container code into drivers/vfio/container.c All the functions that dereference struct vfio_container are moved into container.c. Simple code motion, no functional change. Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v3-297af71838d2+b9-vfio_container_split_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/Makefile | 1 + drivers/vfio/container.c | 680 ++++++++++++++++++++++++++++++++++++++ drivers/vfio/vfio.h | 46 +++ drivers/vfio/vfio_main.c | 692 +-------------------------------------- 4 files changed, 728 insertions(+), 691 deletions(-) create mode 100644 drivers/vfio/container.c diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index d67c604d0407..b693a1169286 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ iova_bitmap.o \ + container.o obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c new file mode 100644 index 000000000000..db7c071ee3de --- /dev/null +++ b/drivers/vfio/container.c @@ -0,0 +1,680 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * + * VFIO container (/dev/vfio/vfio) + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vfio.h" + +struct vfio_container { + struct kref kref; + struct list_head group_list; + struct rw_semaphore group_lock; + struct vfio_iommu_driver *iommu_driver; + void *iommu_data; + bool noiommu; +}; + +static struct vfio { + struct list_head iommu_drivers_list; + struct mutex iommu_drivers_lock; +} vfio; + +#ifdef CONFIG_VFIO_NOIOMMU +bool vfio_noiommu __read_mostly; +module_param_named(enable_unsafe_noiommu_mode, + vfio_noiommu, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); +#endif + +static void *vfio_noiommu_open(unsigned long arg) +{ + if (arg != VFIO_NOIOMMU_IOMMU) + return ERR_PTR(-EINVAL); + if (!capable(CAP_SYS_RAWIO)) + return ERR_PTR(-EPERM); + + return NULL; +} + +static void vfio_noiommu_release(void *iommu_data) +{ +} + +static long vfio_noiommu_ioctl(void *iommu_data, + unsigned int cmd, unsigned long arg) +{ + if (cmd == VFIO_CHECK_EXTENSION) + return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; + + return -ENOTTY; +} + +static int vfio_noiommu_attach_group(void *iommu_data, + struct iommu_group *iommu_group, enum vfio_group_type type) +{ + return 0; +} + +static void vfio_noiommu_detach_group(void *iommu_data, + struct iommu_group *iommu_group) +{ +} + +static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { + .name = "vfio-noiommu", + .owner = THIS_MODULE, + .open = vfio_noiommu_open, + .release = vfio_noiommu_release, + .ioctl = vfio_noiommu_ioctl, + .attach_group = vfio_noiommu_attach_group, + .detach_group = vfio_noiommu_detach_group, +}; + +/* + * Only noiommu containers can use vfio-noiommu and noiommu containers can only + * use vfio-noiommu. + */ +static bool vfio_iommu_driver_allowed(struct vfio_container *container, + const struct vfio_iommu_driver *driver) +{ + if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + return true; + return container->noiommu == (driver->ops == &vfio_noiommu_ops); +} + +/* + * IOMMU driver registration + */ +int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) +{ + struct vfio_iommu_driver *driver, *tmp; + + if (WARN_ON(!ops->register_device != !ops->unregister_device)) + return -EINVAL; + + driver = kzalloc(sizeof(*driver), GFP_KERNEL); + if (!driver) + return -ENOMEM; + + driver->ops = ops; + + mutex_lock(&vfio.iommu_drivers_lock); + + /* Check for duplicates */ + list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { + if (tmp->ops == ops) { + mutex_unlock(&vfio.iommu_drivers_lock); + kfree(driver); + return -EINVAL; + } + } + + list_add(&driver->vfio_next, &vfio.iommu_drivers_list); + + mutex_unlock(&vfio.iommu_drivers_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); + +void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) +{ + struct vfio_iommu_driver *driver; + + mutex_lock(&vfio.iommu_drivers_lock); + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { + if (driver->ops == ops) { + list_del(&driver->vfio_next); + mutex_unlock(&vfio.iommu_drivers_lock); + kfree(driver); + return; + } + } + mutex_unlock(&vfio.iommu_drivers_lock); +} +EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); + +/* + * Container objects - containers are created when /dev/vfio/vfio is + * opened, but their lifecycle extends until the last user is done, so + * it's freed via kref. Must support container/group/device being + * closed in any order. + */ +static void vfio_container_release(struct kref *kref) +{ + struct vfio_container *container; + container = container_of(kref, struct vfio_container, kref); + + kfree(container); +} + +static void vfio_container_get(struct vfio_container *container) +{ + kref_get(&container->kref); +} + +static void vfio_container_put(struct vfio_container *container) +{ + kref_put(&container->kref, vfio_container_release); +} + +void vfio_device_container_register(struct vfio_device *device) +{ + struct vfio_iommu_driver *iommu_driver = + device->group->container->iommu_driver; + + if (iommu_driver && iommu_driver->ops->register_device) + iommu_driver->ops->register_device( + device->group->container->iommu_data, device); +} + +void vfio_device_container_unregister(struct vfio_device *device) +{ + struct vfio_iommu_driver *iommu_driver = + device->group->container->iommu_driver; + + if (iommu_driver && iommu_driver->ops->unregister_device) + iommu_driver->ops->unregister_device( + device->group->container->iommu_data, device); +} + +long vfio_container_ioctl_check_extension(struct vfio_container *container, + unsigned long arg) +{ + struct vfio_iommu_driver *driver; + long ret = 0; + + down_read(&container->group_lock); + + driver = container->iommu_driver; + + switch (arg) { + /* No base extensions yet */ + default: + /* + * If no driver is set, poll all registered drivers for + * extensions and return the first positive result. If + * a driver is already set, further queries will be passed + * only to that driver. + */ + if (!driver) { + mutex_lock(&vfio.iommu_drivers_lock); + list_for_each_entry(driver, &vfio.iommu_drivers_list, + vfio_next) { + + if (!list_empty(&container->group_list) && + !vfio_iommu_driver_allowed(container, + driver)) + continue; + if (!try_module_get(driver->ops->owner)) + continue; + + ret = driver->ops->ioctl(NULL, + VFIO_CHECK_EXTENSION, + arg); + module_put(driver->ops->owner); + if (ret > 0) + break; + } + mutex_unlock(&vfio.iommu_drivers_lock); + } else + ret = driver->ops->ioctl(container->iommu_data, + VFIO_CHECK_EXTENSION, arg); + } + + up_read(&container->group_lock); + + return ret; +} + +/* hold write lock on container->group_lock */ +static int __vfio_container_attach_groups(struct vfio_container *container, + struct vfio_iommu_driver *driver, + void *data) +{ + struct vfio_group *group; + int ret = -ENODEV; + + list_for_each_entry(group, &container->group_list, container_next) { + ret = driver->ops->attach_group(data, group->iommu_group, + group->type); + if (ret) + goto unwind; + } + + return ret; + +unwind: + list_for_each_entry_continue_reverse(group, &container->group_list, + container_next) { + driver->ops->detach_group(data, group->iommu_group); + } + + return ret; +} + +static long vfio_ioctl_set_iommu(struct vfio_container *container, + unsigned long arg) +{ + struct vfio_iommu_driver *driver; + long ret = -ENODEV; + + down_write(&container->group_lock); + + /* + * The container is designed to be an unprivileged interface while + * the group can be assigned to specific users. Therefore, only by + * adding a group to a container does the user get the privilege of + * enabling the iommu, which may allocate finite resources. There + * is no unset_iommu, but by removing all the groups from a container, + * the container is deprivileged and returns to an unset state. + */ + if (list_empty(&container->group_list) || container->iommu_driver) { + up_write(&container->group_lock); + return -EINVAL; + } + + mutex_lock(&vfio.iommu_drivers_lock); + list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { + void *data; + + if (!vfio_iommu_driver_allowed(container, driver)) + continue; + if (!try_module_get(driver->ops->owner)) + continue; + + /* + * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, + * so test which iommu driver reported support for this + * extension and call open on them. We also pass them the + * magic, allowing a single driver to support multiple + * interfaces if they'd like. + */ + if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { + module_put(driver->ops->owner); + continue; + } + + data = driver->ops->open(arg); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + module_put(driver->ops->owner); + continue; + } + + ret = __vfio_container_attach_groups(container, driver, data); + if (ret) { + driver->ops->release(data); + module_put(driver->ops->owner); + continue; + } + + container->iommu_driver = driver; + container->iommu_data = data; + break; + } + + mutex_unlock(&vfio.iommu_drivers_lock); + up_write(&container->group_lock); + + return ret; +} + +static long vfio_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver; + void *data; + long ret = -EINVAL; + + if (!container) + return ret; + + switch (cmd) { + case VFIO_GET_API_VERSION: + ret = VFIO_API_VERSION; + break; + case VFIO_CHECK_EXTENSION: + ret = vfio_container_ioctl_check_extension(container, arg); + break; + case VFIO_SET_IOMMU: + ret = vfio_ioctl_set_iommu(container, arg); + break; + default: + driver = container->iommu_driver; + data = container->iommu_data; + + if (driver) /* passthrough all unrecognized ioctls */ + ret = driver->ops->ioctl(data, cmd, arg); + } + + return ret; +} + +static int vfio_fops_open(struct inode *inode, struct file *filep) +{ + struct vfio_container *container; + + container = kzalloc(sizeof(*container), GFP_KERNEL); + if (!container) + return -ENOMEM; + + INIT_LIST_HEAD(&container->group_list); + init_rwsem(&container->group_lock); + kref_init(&container->kref); + + filep->private_data = container; + + return 0; +} + +static int vfio_fops_release(struct inode *inode, struct file *filep) +{ + struct vfio_container *container = filep->private_data; + struct vfio_iommu_driver *driver = container->iommu_driver; + + if (driver && driver->ops->notify) + driver->ops->notify(container->iommu_data, + VFIO_IOMMU_CONTAINER_CLOSE); + + filep->private_data = NULL; + + vfio_container_put(container); + + return 0; +} + +static const struct file_operations vfio_fops = { + .owner = THIS_MODULE, + .open = vfio_fops_open, + .release = vfio_fops_release, + .unlocked_ioctl = vfio_fops_unl_ioctl, + .compat_ioctl = compat_ptr_ioctl, +}; + +struct vfio_container *vfio_container_from_file(struct file *file) +{ + struct vfio_container *container; + + /* Sanity check, is this really our fd? */ + if (file->f_op != &vfio_fops) + return NULL; + + container = file->private_data; + WARN_ON(!container); /* fget ensures we don't race vfio_release */ + return container; +} + +static struct miscdevice vfio_dev = { + .minor = VFIO_MINOR, + .name = "vfio", + .fops = &vfio_fops, + .nodename = "vfio/vfio", + .mode = S_IRUGO | S_IWUGO, +}; + +int vfio_container_attach_group(struct vfio_container *container, + struct vfio_group *group) +{ + struct vfio_iommu_driver *driver; + int ret = 0; + + lockdep_assert_held_write(&group->group_rwsem); + + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + down_write(&container->group_lock); + + /* Real groups and fake groups cannot mix */ + if (!list_empty(&container->group_list) && + container->noiommu != (group->type == VFIO_NO_IOMMU)) { + ret = -EPERM; + goto out_unlock_container; + } + + if (group->type == VFIO_IOMMU) { + ret = iommu_group_claim_dma_owner(group->iommu_group, group); + if (ret) + goto out_unlock_container; + } + + driver = container->iommu_driver; + if (driver) { + ret = driver->ops->attach_group(container->iommu_data, + group->iommu_group, + group->type); + if (ret) { + if (group->type == VFIO_IOMMU) + iommu_group_release_dma_owner( + group->iommu_group); + goto out_unlock_container; + } + } + + group->container = container; + group->container_users = 1; + container->noiommu = (group->type == VFIO_NO_IOMMU); + list_add(&group->container_next, &container->group_list); + + /* Get a reference on the container and mark a user within the group */ + vfio_container_get(container); + +out_unlock_container: + up_write(&container->group_lock); + return ret; +} + +void vfio_group_detach_container(struct vfio_group *group) +{ + struct vfio_container *container = group->container; + struct vfio_iommu_driver *driver; + + lockdep_assert_held_write(&group->group_rwsem); + WARN_ON(group->container_users != 1); + + down_write(&container->group_lock); + + driver = container->iommu_driver; + if (driver) + driver->ops->detach_group(container->iommu_data, + group->iommu_group); + + if (group->type == VFIO_IOMMU) + iommu_group_release_dma_owner(group->iommu_group); + + group->container = NULL; + group->container_users = 0; + list_del(&group->container_next); + + /* Detaching the last group deprivileges a container, remove iommu */ + if (driver && list_empty(&container->group_list)) { + driver->ops->release(container->iommu_data); + module_put(driver->ops->owner); + container->iommu_driver = NULL; + container->iommu_data = NULL; + } + + up_write(&container->group_lock); + + vfio_container_put(container); +} + +int vfio_device_assign_container(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + + lockdep_assert_held_write(&group->group_rwsem); + + if (!group->container || !group->container->iommu_driver || + WARN_ON(!group->container_users)) + return -EINVAL; + + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + get_file(group->opened_file); + group->container_users++; + return 0; +} + +void vfio_device_unassign_container(struct vfio_device *device) +{ + down_write(&device->group->group_rwsem); + WARN_ON(device->group->container_users <= 1); + device->group->container_users--; + fput(device->group->opened_file); + up_write(&device->group->group_rwsem); +} + +/* + * Pin contiguous user pages and return their associated host pages for local + * domain only. + * @device [in] : device + * @iova [in] : starting IOVA of user pages to be pinned. + * @npage [in] : count of pages to be pinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @pages[out] : array of host pages + * Return error or number of pages pinned. + * + * A driver may only call this function if the vfio_device was created + * by vfio_register_emulated_iommu_dev(). + */ +int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, + int npage, int prot, struct page **pages) +{ + struct vfio_container *container; + struct vfio_group *group = device->group; + struct vfio_iommu_driver *driver; + int ret; + + if (!pages || !npage || !vfio_assert_device_open(device)) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + /* group->container cannot change while a vfio device is open */ + container = group->container; + driver = container->iommu_driver; + if (likely(driver && driver->ops->pin_pages)) + ret = driver->ops->pin_pages(container->iommu_data, + group->iommu_group, iova, + npage, prot, pages); + else + ret = -ENOTTY; + + return ret; +} +EXPORT_SYMBOL(vfio_pin_pages); + +/* + * Unpin contiguous host pages for local domain only. + * @device [in] : device + * @iova [in] : starting address of user pages to be unpinned. + * @npage [in] : count of pages to be unpinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + */ +void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + + if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) + return; + + if (WARN_ON(!vfio_assert_device_open(device))) + return; + + /* group->container cannot change while a vfio device is open */ + container = device->group->container; + driver = container->iommu_driver; + + driver->ops->unpin_pages(container->iommu_data, iova, npage); +} +EXPORT_SYMBOL(vfio_unpin_pages); + +/* + * This interface allows the CPUs to perform some sort of virtual DMA on + * behalf of the device. + * + * CPUs read/write from/into a range of IOVAs pointing to user space memory + * into/from a kernel buffer. + * + * As the read/write of user space memory is conducted via the CPUs and is + * not a real device DMA, it is not necessary to pin the user space memory. + * + * @device [in] : VFIO device + * @iova [in] : base IOVA of a user space buffer + * @data [in] : pointer to kernel buffer + * @len [in] : kernel buffer length + * @write : indicate read or write + * Return error code on failure or 0 on success. + */ +int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, + size_t len, bool write) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret = 0; + + if (!data || len <= 0 || !vfio_assert_device_open(device)) + return -EINVAL; + + /* group->container cannot change while a vfio device is open */ + container = device->group->container; + driver = container->iommu_driver; + + if (likely(driver && driver->ops->dma_rw)) + ret = driver->ops->dma_rw(container->iommu_data, + iova, data, len, write); + else + ret = -ENOTTY; + return ret; +} +EXPORT_SYMBOL(vfio_dma_rw); + +int __init vfio_container_init(void) +{ + int ret; + + mutex_init(&vfio.iommu_drivers_lock); + INIT_LIST_HEAD(&vfio.iommu_drivers_list); + + ret = misc_register(&vfio_dev); + if (ret) { + pr_err("vfio: misc device register failed\n"); + return ret; + } + + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { + ret = vfio_register_iommu_driver(&vfio_noiommu_ops); + if (ret) + goto err_misc; + } + return 0; + +err_misc: + misc_deregister(&vfio_dev); + return ret; +} + +void vfio_container_cleanup(void) +{ + if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) + vfio_unregister_iommu_driver(&vfio_noiommu_ops); + misc_deregister(&vfio_dev); + mutex_destroy(&vfio.iommu_drivers_lock); +} diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 093784f1dea7..56fab31f8e0f 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -11,6 +11,8 @@ #include struct iommu_group; +struct vfio_device; +struct vfio_container; enum vfio_group_type { /* @@ -36,6 +38,24 @@ enum vfio_group_type { VFIO_NO_IOMMU, }; +struct vfio_group { + struct device dev; + struct cdev cdev; + refcount_t users; + unsigned int container_users; + struct iommu_group *iommu_group; + struct vfio_container *container; + struct list_head device_list; + struct mutex device_lock; + struct list_head vfio_next; + struct list_head container_next; + enum vfio_group_type type; + struct rw_semaphore group_rwsem; + struct kvm *kvm; + struct file *opened_file; + struct blocking_notifier_head notifier; +}; + /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { VFIO_IOMMU_CONTAINER_CLOSE = 0, @@ -75,7 +95,33 @@ struct vfio_iommu_driver_ops { enum vfio_iommu_notify_type event); }; +struct vfio_iommu_driver { + const struct vfio_iommu_driver_ops *ops; + struct list_head vfio_next; +}; + int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); +bool vfio_assert_device_open(struct vfio_device *device); + +struct vfio_container *vfio_container_from_file(struct file *filep); +int vfio_device_assign_container(struct vfio_device *device); +void vfio_device_unassign_container(struct vfio_device *device); +int vfio_container_attach_group(struct vfio_container *container, + struct vfio_group *group); +void vfio_group_detach_container(struct vfio_group *group); +void vfio_device_container_register(struct vfio_device *device); +void vfio_device_container_unregister(struct vfio_device *device); +long vfio_container_ioctl_check_extension(struct vfio_container *container, + unsigned long arg); +int __init vfio_container_init(void); +void vfio_container_cleanup(void); + +#ifdef CONFIG_VFIO_NOIOMMU +extern bool vfio_noiommu __read_mostly; +#else +enum { vfio_noiommu = false }; +#endif + #endif diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 1ac7160f9329..af5945c71c41 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -43,8 +43,6 @@ static struct vfio { struct class *class; - struct list_head iommu_drivers_list; - struct mutex iommu_drivers_lock; struct list_head group_list; struct mutex group_lock; /* locks group_list */ struct ida group_ida; @@ -53,47 +51,6 @@ static struct vfio { struct ida device_ida; } vfio; -struct vfio_iommu_driver { - const struct vfio_iommu_driver_ops *ops; - struct list_head vfio_next; -}; - -struct vfio_container { - struct kref kref; - struct list_head group_list; - struct rw_semaphore group_lock; - struct vfio_iommu_driver *iommu_driver; - void *iommu_data; - bool noiommu; -}; - -struct vfio_group { - struct device dev; - struct cdev cdev; - refcount_t users; - unsigned int container_users; - struct iommu_group *iommu_group; - struct vfio_container *container; - struct list_head device_list; - struct mutex device_lock; - struct list_head vfio_next; - struct list_head container_next; - enum vfio_group_type type; - struct rw_semaphore group_rwsem; - struct kvm *kvm; - struct file *opened_file; - struct blocking_notifier_head notifier; -}; - -#ifdef CONFIG_VFIO_NOIOMMU -static bool vfio_noiommu __read_mostly; -module_param_named(enable_unsafe_noiommu_mode, - vfio_noiommu, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)"); -#else -enum { vfio_noiommu = false }; -#endif - static DEFINE_XARRAY(vfio_device_set_xa); static const struct file_operations vfio_group_fops; @@ -168,140 +125,8 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } -static void *vfio_noiommu_open(unsigned long arg) -{ - if (arg != VFIO_NOIOMMU_IOMMU) - return ERR_PTR(-EINVAL); - if (!capable(CAP_SYS_RAWIO)) - return ERR_PTR(-EPERM); - - return NULL; -} - -static void vfio_noiommu_release(void *iommu_data) -{ -} - -static long vfio_noiommu_ioctl(void *iommu_data, - unsigned int cmd, unsigned long arg) -{ - if (cmd == VFIO_CHECK_EXTENSION) - return vfio_noiommu && (arg == VFIO_NOIOMMU_IOMMU) ? 1 : 0; - - return -ENOTTY; -} - -static int vfio_noiommu_attach_group(void *iommu_data, - struct iommu_group *iommu_group, enum vfio_group_type type) -{ - return 0; -} - -static void vfio_noiommu_detach_group(void *iommu_data, - struct iommu_group *iommu_group) -{ -} - -static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { - .name = "vfio-noiommu", - .owner = THIS_MODULE, - .open = vfio_noiommu_open, - .release = vfio_noiommu_release, - .ioctl = vfio_noiommu_ioctl, - .attach_group = vfio_noiommu_attach_group, - .detach_group = vfio_noiommu_detach_group, -}; - -/* - * Only noiommu containers can use vfio-noiommu and noiommu containers can only - * use vfio-noiommu. - */ -static bool vfio_iommu_driver_allowed(struct vfio_container *container, - const struct vfio_iommu_driver *driver) -{ - if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU)) - return true; - return container->noiommu == (driver->ops == &vfio_noiommu_ops); -} - -/* - * IOMMU driver registration - */ -int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops) -{ - struct vfio_iommu_driver *driver, *tmp; - - if (WARN_ON(!ops->register_device != !ops->unregister_device)) - return -EINVAL; - - driver = kzalloc(sizeof(*driver), GFP_KERNEL); - if (!driver) - return -ENOMEM; - - driver->ops = ops; - - mutex_lock(&vfio.iommu_drivers_lock); - - /* Check for duplicates */ - list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) { - if (tmp->ops == ops) { - mutex_unlock(&vfio.iommu_drivers_lock); - kfree(driver); - return -EINVAL; - } - } - - list_add(&driver->vfio_next, &vfio.iommu_drivers_list); - - mutex_unlock(&vfio.iommu_drivers_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(vfio_register_iommu_driver); - -void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) -{ - struct vfio_iommu_driver *driver; - - mutex_lock(&vfio.iommu_drivers_lock); - list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { - if (driver->ops == ops) { - list_del(&driver->vfio_next); - mutex_unlock(&vfio.iommu_drivers_lock); - kfree(driver); - return; - } - } - mutex_unlock(&vfio.iommu_drivers_lock); -} -EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); - static void vfio_group_get(struct vfio_group *group); -/* - * Container objects - containers are created when /dev/vfio/vfio is - * opened, but their lifecycle extends until the last user is done, so - * it's freed via kref. Must support container/group/device being - * closed in any order. - */ -static void vfio_container_get(struct vfio_container *container) -{ - kref_get(&container->kref); -} - -static void vfio_container_release(struct kref *kref) -{ - struct vfio_container *container; - container = container_of(kref, struct vfio_container, kref); - - kfree(container); -} - -static void vfio_container_put(struct vfio_container *container) -{ - kref_put(&container->kref, vfio_container_release); -} - /* * Group objects - create, release, get, put, search */ @@ -810,263 +635,9 @@ void vfio_unregister_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); -/* - * VFIO base fd, /dev/vfio/vfio - */ -static long -vfio_container_ioctl_check_extension(struct vfio_container *container, - unsigned long arg) -{ - struct vfio_iommu_driver *driver; - long ret = 0; - - down_read(&container->group_lock); - - driver = container->iommu_driver; - - switch (arg) { - /* No base extensions yet */ - default: - /* - * If no driver is set, poll all registered drivers for - * extensions and return the first positive result. If - * a driver is already set, further queries will be passed - * only to that driver. - */ - if (!driver) { - mutex_lock(&vfio.iommu_drivers_lock); - list_for_each_entry(driver, &vfio.iommu_drivers_list, - vfio_next) { - - if (!list_empty(&container->group_list) && - !vfio_iommu_driver_allowed(container, - driver)) - continue; - if (!try_module_get(driver->ops->owner)) - continue; - - ret = driver->ops->ioctl(NULL, - VFIO_CHECK_EXTENSION, - arg); - module_put(driver->ops->owner); - if (ret > 0) - break; - } - mutex_unlock(&vfio.iommu_drivers_lock); - } else - ret = driver->ops->ioctl(container->iommu_data, - VFIO_CHECK_EXTENSION, arg); - } - - up_read(&container->group_lock); - - return ret; -} - -/* hold write lock on container->group_lock */ -static int __vfio_container_attach_groups(struct vfio_container *container, - struct vfio_iommu_driver *driver, - void *data) -{ - struct vfio_group *group; - int ret = -ENODEV; - - list_for_each_entry(group, &container->group_list, container_next) { - ret = driver->ops->attach_group(data, group->iommu_group, - group->type); - if (ret) - goto unwind; - } - - return ret; - -unwind: - list_for_each_entry_continue_reverse(group, &container->group_list, - container_next) { - driver->ops->detach_group(data, group->iommu_group); - } - - return ret; -} - -static long vfio_ioctl_set_iommu(struct vfio_container *container, - unsigned long arg) -{ - struct vfio_iommu_driver *driver; - long ret = -ENODEV; - - down_write(&container->group_lock); - - /* - * The container is designed to be an unprivileged interface while - * the group can be assigned to specific users. Therefore, only by - * adding a group to a container does the user get the privilege of - * enabling the iommu, which may allocate finite resources. There - * is no unset_iommu, but by removing all the groups from a container, - * the container is deprivileged and returns to an unset state. - */ - if (list_empty(&container->group_list) || container->iommu_driver) { - up_write(&container->group_lock); - return -EINVAL; - } - - mutex_lock(&vfio.iommu_drivers_lock); - list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { - void *data; - - if (!vfio_iommu_driver_allowed(container, driver)) - continue; - if (!try_module_get(driver->ops->owner)) - continue; - - /* - * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION, - * so test which iommu driver reported support for this - * extension and call open on them. We also pass them the - * magic, allowing a single driver to support multiple - * interfaces if they'd like. - */ - if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) { - module_put(driver->ops->owner); - continue; - } - - data = driver->ops->open(arg); - if (IS_ERR(data)) { - ret = PTR_ERR(data); - module_put(driver->ops->owner); - continue; - } - - ret = __vfio_container_attach_groups(container, driver, data); - if (ret) { - driver->ops->release(data); - module_put(driver->ops->owner); - continue; - } - - container->iommu_driver = driver; - container->iommu_data = data; - break; - } - - mutex_unlock(&vfio.iommu_drivers_lock); - up_write(&container->group_lock); - - return ret; -} - -static long vfio_fops_unl_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver; - void *data; - long ret = -EINVAL; - - if (!container) - return ret; - - switch (cmd) { - case VFIO_GET_API_VERSION: - ret = VFIO_API_VERSION; - break; - case VFIO_CHECK_EXTENSION: - ret = vfio_container_ioctl_check_extension(container, arg); - break; - case VFIO_SET_IOMMU: - ret = vfio_ioctl_set_iommu(container, arg); - break; - default: - driver = container->iommu_driver; - data = container->iommu_data; - - if (driver) /* passthrough all unrecognized ioctls */ - ret = driver->ops->ioctl(data, cmd, arg); - } - - return ret; -} - -static int vfio_fops_open(struct inode *inode, struct file *filep) -{ - struct vfio_container *container; - - container = kzalloc(sizeof(*container), GFP_KERNEL); - if (!container) - return -ENOMEM; - - INIT_LIST_HEAD(&container->group_list); - init_rwsem(&container->group_lock); - kref_init(&container->kref); - - filep->private_data = container; - - return 0; -} - -static int vfio_fops_release(struct inode *inode, struct file *filep) -{ - struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver = container->iommu_driver; - - if (driver && driver->ops->notify) - driver->ops->notify(container->iommu_data, - VFIO_IOMMU_CONTAINER_CLOSE); - - filep->private_data = NULL; - - vfio_container_put(container); - - return 0; -} - -static const struct file_operations vfio_fops = { - .owner = THIS_MODULE, - .open = vfio_fops_open, - .release = vfio_fops_release, - .unlocked_ioctl = vfio_fops_unl_ioctl, - .compat_ioctl = compat_ptr_ioctl, -}; - /* * VFIO Group fd, /dev/vfio/$GROUP */ -static void vfio_group_detach_container(struct vfio_group *group) -{ - struct vfio_container *container = group->container; - struct vfio_iommu_driver *driver; - - lockdep_assert_held_write(&group->group_rwsem); - WARN_ON(group->container_users != 1); - - down_write(&container->group_lock); - - driver = container->iommu_driver; - if (driver) - driver->ops->detach_group(container->iommu_data, - group->iommu_group); - - if (group->type == VFIO_IOMMU) - iommu_group_release_dma_owner(group->iommu_group); - - group->container = NULL; - group->container_users = 0; - list_del(&group->container_next); - - /* Detaching the last group deprivileges a container, remove iommu */ - if (driver && list_empty(&container->group_list)) { - driver->ops->release(container->iommu_data); - module_put(driver->ops->owner); - container->iommu_driver = NULL; - container->iommu_data = NULL; - } - - up_write(&container->group_lock); - - vfio_container_put(container); -} - /* * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or * if there was no container to unset. Since the ioctl is called on @@ -1093,71 +664,6 @@ static int vfio_group_ioctl_unset_container(struct vfio_group *group) return ret; } -static struct vfio_container *vfio_container_from_file(struct file *file) -{ - struct vfio_container *container; - - /* Sanity check, is this really our fd? */ - if (file->f_op != &vfio_fops) - return NULL; - - container = file->private_data; - WARN_ON(!container); /* fget ensures we don't race vfio_release */ - return container; -} - -static int vfio_container_attach_group(struct vfio_container *container, - struct vfio_group *group) -{ - struct vfio_iommu_driver *driver; - int ret = 0; - - lockdep_assert_held_write(&group->group_rwsem); - - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - down_write(&container->group_lock); - - /* Real groups and fake groups cannot mix */ - if (!list_empty(&container->group_list) && - container->noiommu != (group->type == VFIO_NO_IOMMU)) { - ret = -EPERM; - goto out_unlock_container; - } - - if (group->type == VFIO_IOMMU) { - ret = iommu_group_claim_dma_owner(group->iommu_group, group); - if (ret) - goto out_unlock_container; - } - - driver = container->iommu_driver; - if (driver) { - ret = driver->ops->attach_group(container->iommu_data, - group->iommu_group, - group->type); - if (ret) { - if (group->type == VFIO_IOMMU) - iommu_group_release_dma_owner( - group->iommu_group); - goto out_unlock_container; - } - } - - group->container = container; - group->container_users = 1; - container->noiommu = (group->type == VFIO_NO_IOMMU); - list_add(&group->container_next, &container->group_list); - - /* Get a reference on the container and mark a user within the group */ - vfio_container_get(container); - -out_unlock_container: - up_write(&container->group_lock); - return ret; -} - static int vfio_group_ioctl_set_container(struct vfio_group *group, int __user *arg) { @@ -1194,58 +700,11 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, static const struct file_operations vfio_device_fops; /* true if the vfio_device has open_device() called but not close_device() */ -static bool vfio_assert_device_open(struct vfio_device *device) +bool vfio_assert_device_open(struct vfio_device *device) { return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } -static int vfio_device_assign_container(struct vfio_device *device) -{ - struct vfio_group *group = device->group; - - lockdep_assert_held_write(&group->group_rwsem); - - if (!group->container || !group->container->iommu_driver || - WARN_ON(!group->container_users)) - return -EINVAL; - - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) - return -EPERM; - - get_file(group->opened_file); - group->container_users++; - return 0; -} - -static void vfio_device_unassign_container(struct vfio_device *device) -{ - down_write(&device->group->group_rwsem); - WARN_ON(device->group->container_users <= 1); - device->group->container_users--; - fput(device->group->opened_file); - up_write(&device->group->group_rwsem); -} - -static void vfio_device_container_register(struct vfio_device *device) -{ - struct vfio_iommu_driver *iommu_driver = - device->group->container->iommu_driver; - - if (iommu_driver && iommu_driver->ops->register_device) - iommu_driver->ops->register_device( - device->group->container->iommu_data, device); -} - -static void vfio_device_container_unregister(struct vfio_device *device) -{ - struct vfio_iommu_driver *iommu_driver = - device->group->container->iommu_driver; - - if (iommu_driver && iommu_driver->ops->unregister_device) - iommu_driver->ops->unregister_device( - device->group->container->iommu_data, device); -} - static struct file *vfio_device_open(struct vfio_device *device) { struct file *filep; @@ -2281,114 +1740,6 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, } EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); -/* - * Pin contiguous user pages and return their associated host pages for local - * domain only. - * @device [in] : device - * @iova [in] : starting IOVA of user pages to be pinned. - * @npage [in] : count of pages to be pinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - * @prot [in] : protection flags - * @pages[out] : array of host pages - * Return error or number of pages pinned. - * - * A driver may only call this function if the vfio_device was created - * by vfio_register_emulated_iommu_dev(). - */ -int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, - int npage, int prot, struct page **pages) -{ - struct vfio_container *container; - struct vfio_group *group = device->group; - struct vfio_iommu_driver *driver; - int ret; - - if (!pages || !npage || !vfio_assert_device_open(device)) - return -EINVAL; - - if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) - return -E2BIG; - - /* group->container cannot change while a vfio device is open */ - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, - group->iommu_group, iova, - npage, prot, pages); - else - ret = -ENOTTY; - - return ret; -} -EXPORT_SYMBOL(vfio_pin_pages); - -/* - * Unpin contiguous host pages for local domain only. - * @device [in] : device - * @iova [in] : starting address of user pages to be unpinned. - * @npage [in] : count of pages to be unpinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - */ -void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - - if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) - return; - - if (WARN_ON(!vfio_assert_device_open(device))) - return; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; - - driver->ops->unpin_pages(container->iommu_data, iova, npage); -} -EXPORT_SYMBOL(vfio_unpin_pages); - -/* - * This interface allows the CPUs to perform some sort of virtual DMA on - * behalf of the device. - * - * CPUs read/write from/into a range of IOVAs pointing to user space memory - * into/from a kernel buffer. - * - * As the read/write of user space memory is conducted via the CPUs and is - * not a real device DMA, it is not necessary to pin the user space memory. - * - * @device [in] : VFIO device - * @iova [in] : base IOVA of a user space buffer - * @data [in] : pointer to kernel buffer - * @len [in] : kernel buffer length - * @write : indicate read or write - * Return error code on failure or 0 on success. - */ -int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, - size_t len, bool write) -{ - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret = 0; - - if (!data || len <= 0 || !vfio_assert_device_open(device)) - return -EINVAL; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; - - if (likely(driver && driver->ops->dma_rw)) - ret = driver->ops->dma_rw(container->iommu_data, - iova, data, len, write); - else - ret = -ENOTTY; - return ret; -} -EXPORT_SYMBOL(vfio_dma_rw); - /* * Module/class support */ @@ -2397,47 +1748,6 @@ static char *vfio_devnode(struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); } -static struct miscdevice vfio_dev = { - .minor = VFIO_MINOR, - .name = "vfio", - .fops = &vfio_fops, - .nodename = "vfio/vfio", - .mode = S_IRUGO | S_IWUGO, -}; - -static int __init vfio_container_init(void) -{ - int ret; - - mutex_init(&vfio.iommu_drivers_lock); - INIT_LIST_HEAD(&vfio.iommu_drivers_list); - - ret = misc_register(&vfio_dev); - if (ret) { - pr_err("vfio: misc device register failed\n"); - return ret; - } - - if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) { - ret = vfio_register_iommu_driver(&vfio_noiommu_ops); - if (ret) - goto err_misc; - } - return 0; - -err_misc: - misc_deregister(&vfio_dev); - return ret; -} - -static void vfio_container_cleanup(void) -{ - if (IS_ENABLED(CONFIG_VFIO_NOIOMMU)) - vfio_unregister_iommu_driver(&vfio_noiommu_ops); - misc_deregister(&vfio_dev); - mutex_destroy(&vfio.iommu_drivers_lock); -} - static int __init vfio_init(void) { int ret; From ca5f21b2574903a7430fcb3590e534d92b2fa816 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Sep 2022 21:06:10 -0300 Subject: [PATCH 51/77] vfio: Follow a strict lifetime for struct iommu_group The iommu_group comes from the struct device that a driver has been bound to and then created a struct vfio_device against. To keep the iommu layer sane we want to have a simple rule that only an attached driver should be using the iommu API. Particularly only an attached driver should hold ownership. In VFIO's case since it uses the group APIs and it shares between different drivers it is a bit more complicated, but the principle still holds. Solve this by waiting for all users of the vfio_group to stop before allowing vfio_unregister_group_dev() to complete. This is done with a new completion to know when the users go away and an additional refcount to keep track of how many device drivers are sharing the vfio group. The last driver to be unregistered will clean up the group. This solves crashes in the S390 iommu driver that come because VFIO ends up racing releasing ownership (which attaches the default iommu_domain to the device) with the removal of that same device from the iommu driver. This is a side case that iommu drivers should not have to cope with. iommu driver failed to attach the default/blocking domain WARNING: CPU: 0 PID: 5082 at drivers/iommu/iommu.c:1961 iommu_detach_group+0x6c/0x80 Modules linked in: macvtap macvlan tap vfio_pci vfio_pci_core irqbypass vfio_virqfd kvm nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink mlx5_ib sunrpc ib_uverbs ism smc uvdevice ib_core s390_trng eadm_sch tape_3590 tape tape_class vfio_ccw mdev vfio_iommu_type1 vfio zcrypt_cex4 sch_fq_codel configfs ghash_s390 prng chacha_s390 libchacha aes_s390 mlx5_core des_s390 libdes sha3_512_s390 nvme sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common nvme_core zfcp scsi_transport_fc pkey zcrypt rng_core autofs4 CPU: 0 PID: 5082 Comm: qemu-system-s39 Tainted: G W 6.0.0-rc3 #5 Hardware name: IBM 3931 A01 782 (LPAR) Krnl PSW : 0704c00180000000 000000095bb10d28 (iommu_detach_group+0x70/0x80) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 Krnl GPRS: 0000000000000001 0000000900000027 0000000000000039 000000095c97ffe0 00000000fffeffff 00000009fc290000 00000000af1fda50 00000000af590b58 00000000af1fdaf0 0000000135c7a320 0000000135e52258 0000000135e52200 00000000a29e8000 00000000af590b40 000000095bb10d24 0000038004b13c98 Krnl Code: 000000095bb10d18: c020003d56fc larl %r2,000000095c2bbb10 000000095bb10d1e: c0e50019d901 brasl %r14,000000095be4bf20 #000000095bb10d24: af000000 mc 0,0 >000000095bb10d28: b904002a lgr %r2,%r10 000000095bb10d2c: ebaff0a00004 lmg %r10,%r15,160(%r15) 000000095bb10d32: c0f4001aa867 brcl 15,000000095be65e00 000000095bb10d38: c004002168e0 brcl 0,000000095bf3def8 000000095bb10d3e: eb6ff0480024 stmg %r6,%r15,72(%r15) Call Trace: [<000000095bb10d28>] iommu_detach_group+0x70/0x80 ([<000000095bb10d24>] iommu_detach_group+0x6c/0x80) [<000003ff80243b0e>] vfio_iommu_type1_detach_group+0x136/0x6c8 [vfio_iommu_type1] [<000003ff80137780>] __vfio_group_unset_container+0x58/0x158 [vfio] [<000003ff80138a16>] vfio_group_fops_unl_ioctl+0x1b6/0x210 [vfio] pci 0004:00:00.0: Removing from iommu group 4 [<000000095b5b62e8>] __s390x_sys_ioctl+0xc0/0x100 [<000000095be5d3b4>] __do_syscall+0x1d4/0x200 [<000000095be6c072>] system_call+0x82/0xb0 Last Breaking-Event-Address: [<000000095be4bf80>] __warn_printk+0x60/0x68 It indicates that domain->ops->attach_dev() failed because the driver has already passed the point of destructing the device. Fixes: 9ac8545199a1 ("iommu: Fix use-after-free in iommu_release_device") Reported-by: Matthew Rosato Tested-by: Matthew Rosato Reviewed-by: Yi Liu Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v2-a3c5f4429e2a+55-iommu_group_lifetime_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.h | 8 +++++ drivers/vfio/vfio_main.c | 68 ++++++++++++++++++++++++++-------------- 2 files changed, 53 insertions(+), 23 deletions(-) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 56fab31f8e0f..039e3208d286 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -41,7 +41,15 @@ enum vfio_group_type { struct vfio_group { struct device dev; struct cdev cdev; + /* + * When drivers is non-zero a driver is attached to the struct device + * that provided the iommu_group and thus the iommu_group is a valid + * pointer. When drivers is 0 the driver is being detached. Once users + * reaches 0 then the iommu_group is invalid. + */ + refcount_t drivers; refcount_t users; + struct completion users_comp; unsigned int container_users; struct iommu_group *iommu_group; struct vfio_container *container; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index af5945c71c41..f19171cad9a2 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -125,8 +125,6 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } -static void vfio_group_get(struct vfio_group *group); - /* * Group objects - create, release, get, put, search */ @@ -137,7 +135,7 @@ __vfio_group_get_from_iommu(struct iommu_group *iommu_group) list_for_each_entry(group, &vfio.group_list, vfio_next) { if (group->iommu_group == iommu_group) { - vfio_group_get(group); + refcount_inc(&group->drivers); return group; } } @@ -189,6 +187,8 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, group->cdev.owner = THIS_MODULE; refcount_set(&group->users, 1); + refcount_set(&group->drivers, 1); + init_completion(&group->users_comp); init_rwsem(&group->group_rwsem); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); @@ -247,8 +247,41 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, static void vfio_group_put(struct vfio_group *group) { - if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock)) + if (refcount_dec_and_test(&group->users)) + complete(&group->users_comp); +} + +static void vfio_device_remove_group(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + + if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) + iommu_group_remove_device(device->dev); + + /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */ + if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) return; + list_del(&group->vfio_next); + + /* + * We could concurrently probe another driver in the group that might + * race vfio_device_remove_group() with vfio_get_group(), so we have to + * ensure that the sysfs is all cleaned up under lock otherwise the + * cdev_device_add() will fail due to the name aready existing. + */ + cdev_device_del(&group->cdev, &group->dev); + mutex_unlock(&vfio.group_lock); + + /* Matches the get from vfio_group_alloc() */ + vfio_group_put(group); + + /* + * Before we allow the last driver in the group to be unplugged the + * group must be sanitized so nothing else is or can reference it. This + * is because the group->iommu_group pointer should only be used so long + * as a device driver is attached to a device in the group. + */ + wait_for_completion(&group->users_comp); /* * These data structures all have paired operations that can only be @@ -259,19 +292,11 @@ static void vfio_group_put(struct vfio_group *group) WARN_ON(!list_empty(&group->device_list)); WARN_ON(group->container || group->container_users); WARN_ON(group->notifier.head); - - list_del(&group->vfio_next); - cdev_device_del(&group->cdev, &group->dev); - mutex_unlock(&vfio.group_lock); + group->iommu_group = NULL; put_device(&group->dev); } -static void vfio_group_get(struct vfio_group *group) -{ - refcount_inc(&group->users); -} - /* * Device objects - create, release, get, put, search */ @@ -494,6 +519,10 @@ static int __vfio_register_dev(struct vfio_device *device, struct vfio_device *existing_device; int ret; + /* + * In all cases group is the output of one of the group allocation + * functions and we have group->drivers incremented for us. + */ if (IS_ERR(group)) return PTR_ERR(group); @@ -533,10 +562,7 @@ static int __vfio_register_dev(struct vfio_device *device, return 0; err_out: - if (group->type == VFIO_NO_IOMMU || - group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - vfio_group_put(group); + vfio_device_remove_group(device); return ret; } @@ -627,11 +653,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) /* Balances device_add in register path */ device_del(&device->device); - if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - - /* Matches the get in vfio_register_group_dev() */ - vfio_group_put(group); + vfio_device_remove_group(device); } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); @@ -884,7 +906,7 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) down_write(&group->group_rwsem); - /* users can be zero if this races with vfio_group_put() */ + /* users can be zero if this races with vfio_device_remove_group() */ if (!refcount_inc_not_zero(&group->users)) { ret = -ENODEV; goto err_unlock; From 948f5ada58b552d975d1937a3f5939414f28cacb Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:28 +0800 Subject: [PATCH 52/77] hisi_acc_vfio_pci: Fixes error return code issue During the process of compatibility and matching of live migration device information, if the isolation status of the two devices is inconsistent, the live migration needs to be exited. The current driver does not return the error code correctly and needs to be fixed. Reviewed-by: Shameer Kolothum Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-2-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 47174e2b61bd..4ef9761ef467 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -397,7 +397,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (vf_data->que_iso_cfg != que_iso_state) { dev_err(dev, "failed to match isolation state\n"); - return ret; + return -EINVAL; } ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1); From 008e5e996f425f64c21755ebe77201895bbee3b8 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:29 +0800 Subject: [PATCH 53/77] hisi_acc_vfio_pci: Fix device data address combination problem The queue address of the accelerator device should be combined into a dma address in a way of combining the low and high bits. The previous combination is wrong and needs to be modified. Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-3-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 4ef9761ef467..fbe72ce173de 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -520,12 +520,12 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, return -EINVAL; /* Every reg is 32 bit, the dma address is 64 bit. */ - vf_data->eqe_dma = vf_data->qm_eqc_dw[2]; + vf_data->eqe_dma = vf_data->qm_eqc_dw[1]; vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->eqe_dma |= vf_data->qm_eqc_dw[1]; - vf_data->aeqe_dma = vf_data->qm_aeqc_dw[2]; + vf_data->eqe_dma |= vf_data->qm_eqc_dw[0]; + vf_data->aeqe_dma = vf_data->qm_aeqc_dw[1]; vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[1]; + vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[0]; /* Through SQC_BT/CQC_BT to get sqc and cqc address */ ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma); From af72f53c1b4e9614448b5d4e7b39d30d3339e3f7 Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:30 +0800 Subject: [PATCH 54/77] hisi_acc_vfio_pci: Remove useless function parameter Remove unused function parameters for vf_qm_fun_reset() and ensure the device is enabled before the reset operation is performed. Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-4-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index fbe72ce173de..c07ed7b0ccf1 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -337,8 +337,7 @@ static int vf_qm_cache_wb(struct hisi_qm *qm) return 0; } -static void vf_qm_fun_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev, - struct hisi_qm *qm) +static void vf_qm_fun_reset(struct hisi_qm *qm) { int i; @@ -662,7 +661,10 @@ static void hisi_acc_vf_start_device(struct hisi_acc_vf_core_device *hisi_acc_vd if (hisi_acc_vdev->vf_qm_state != QM_READY) return; - vf_qm_fun_reset(hisi_acc_vdev, vf_qm); + /* Make sure the device is enabled */ + qm_dev_cmd_init(vf_qm); + + vf_qm_fun_reset(vf_qm); } static int hisi_acc_vf_load_state(struct hisi_acc_vf_core_device *hisi_acc_vdev) From 3b7cfba0d873e8a26ac4ec5848dcb7c93098cfab Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:31 +0800 Subject: [PATCH 55/77] hisi_acc_vfio_pci: Remove useless macro definitions The QM_QUE_ISO_CFG macro definition is no longer used and needs to be deleted from the current driver. Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-5-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 5494f4983bbe..8e4bf21deae1 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -16,7 +16,6 @@ #define SEC_CORE_INT_STATUS 0x301008 #define HPRE_HAC_INT_STATUS 0x301800 #define HZIP_CORE_INT_STATUS 0x3010AC -#define QM_QUE_ISO_CFG 0x301154 #define QM_VFT_CFG_RDY 0x10006c #define QM_VFT_CFG_OP_WR 0x100058 From 42e1d1eed20a17c6cbb1d600c77a6ca69a632d4c Mon Sep 17 00:00:00 2001 From: Longfang Liu Date: Mon, 26 Sep 2022 17:33:32 +0800 Subject: [PATCH 56/77] hisi_acc_vfio_pci: Update some log and comment formats 1. Modify some annotation information formats to keep the entire driver annotation format consistent. 2. Modify some log description formats to be consistent with the format of the entire driver log. Reviewed-by: Jason Gunthorpe Signed-off-by: Longfang Liu Link: https://lore.kernel.org/r/20220926093332.28824-6-liulongfang@huawei.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 18 +++++++++--------- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index c07ed7b0ccf1..39eeca18a0f7 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -16,7 +16,7 @@ #include "hisi_acc_vfio_pci.h" -/* return 0 on VM acc device ready, -ETIMEDOUT hardware timeout */ +/* Return 0 on VM acc device ready, -ETIMEDOUT hardware timeout */ static int qm_wait_dev_not_ready(struct hisi_qm *qm) { u32 val; @@ -189,7 +189,7 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) struct device *dev = &qm->pdev->dev; int ret; - /* check VF state */ + /* Check VF state */ if (unlikely(hisi_qm_wait_mb_ready(qm))) { dev_err(&qm->pdev->dev, "QM device is not ready to write\n"); return -EBUSY; @@ -373,7 +373,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, return -EINVAL; } - /* vf qp num check */ + /* VF qp num check */ ret = qm_get_vft(vf_qm, &vf_qm->qp_base); if (ret <= 0) { dev_err(dev, "failed to get vft qp nums\n"); @@ -387,7 +387,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, vf_qm->qp_num = ret; - /* vf isolation state check */ + /* VF isolation state check */ ret = qm_read_regs(pf_qm, QM_QUE_ISO_CFG_V, &que_iso_state, 1); if (ret) { dev_err(dev, "failed to read QM_QUE_ISO_CFG_V\n"); @@ -418,10 +418,10 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, int ret; vf_data->acc_magic = ACC_DEV_MAGIC; - /* save device id */ + /* Save device id */ vf_data->dev_id = hisi_acc_vdev->vf_dev->device; - /* vf qp num save from PF */ + /* VF qp num save from PF */ ret = pf_qm_get_qp_num(pf_qm, vf_id, &vf_data->qp_base); if (ret <= 0) { dev_err(dev, "failed to get vft qp nums!\n"); @@ -465,19 +465,19 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, ret = qm_set_regs(qm, vf_data); if (ret) { - dev_err(dev, "Set VF regs failed\n"); + dev_err(dev, "set VF regs failed\n"); return ret; } ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_BT, qm->sqc_dma, 0, 0); if (ret) { - dev_err(dev, "Set sqc failed\n"); + dev_err(dev, "set sqc failed\n"); return ret; } ret = hisi_qm_mb(qm, QM_MB_CMD_CQC_BT, qm->cqc_dma, 0, 0); if (ret) { - dev_err(dev, "Set cqc failed\n"); + dev_err(dev, "set cqc failed\n"); return ret; } diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 8e4bf21deae1..67343325b320 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -79,7 +79,7 @@ struct acc_vf_data { /* QM reserved 5 regs */ u32 qm_rsv_regs[5]; u32 padding; - /* qm memory init information */ + /* QM memory init information */ u64 eqe_dma; u64 aeqe_dma; u64 sqc_dma; @@ -98,7 +98,7 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; u8 deferred_reset:1; - /* for migration state */ + /* For migration state */ struct mutex state_mutex; enum vfio_device_mig_state mig_state; struct pci_dev *pf_dev; @@ -107,7 +107,7 @@ struct hisi_acc_vf_core_device { struct hisi_qm vf_qm; u32 vf_qm_state; int vf_id; - /* for reset handler */ + /* For reset handler */ spinlock_t reset_lock; struct hisi_acc_vf_migration_file *resuming_migf; struct hisi_acc_vf_migration_file *saving_migf; From f423fa1bc9fe1978e6b9f54927411b62cb43eb04 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 29 Sep 2022 14:48:35 -0300 Subject: [PATCH 57/77] drm/i915/gvt: Add missing vfio_unregister_group_dev() call When converting to directly create the vfio_device the mdev driver has to put a vfio_register_emulated_iommu_dev() in the probe() and a pairing vfio_unregister_group_dev() in the remove. This was missed for gvt, add it. Cc: stable@vger.kernel.org Fixes: 978cf586ac35 ("drm/i915/gvt: convert to use vfio_register_emulated_iommu_dev") Reported-by: Alex Williamson Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/0-v1-013609965fe8+9d-vfio_gvt_unregister_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 41bba40feef8..9003145adb5a 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1615,6 +1615,7 @@ static void intel_vgpu_remove(struct mdev_device *mdev) if (WARN_ON_ONCE(vgpu->attached)) return; + vfio_unregister_group_dev(&vgpu->vfio_device); vfio_put_device(&vgpu->vfio_device); } From 98828955971363e838149105c268b1fad905f15b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:39 +0200 Subject: [PATCH 58/77] drm/i915/gvt: fix a memory leak in intel_gvt_init_vgpu_types gvt->types needs to be freed on error. Fixes: bc90d097ae14 ("drm/i915/gvt: define weight according to vGPU type") Reported-by: Kevin Tian Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Zhenyu Wang Link: https://lore.kernel.org/r/20220923092652.100656-2-hch@lst.de [aw: Correct fixes commit ID as reported by Stephen Rothwell] Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/vgpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 5c533fbc2c8d..dbb2a971ba5d 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -142,7 +142,7 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) if (vgpu_types[i].weight < 1 || vgpu_types[i].weight > VGPU_MAX_WEIGHT) - return -EINVAL; + goto out_free_types; gvt->types[i].weight = vgpu_types[i].weight; gvt->types[i].resolution = vgpu_types[i].edid; @@ -167,6 +167,10 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) gvt->num_types = i; return 0; + +out_free_types: + kfree(gvt->types); + return -EINVAL; } void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt) From 1aa3834f510c9d9206ce4d40aff4903b0c016761 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:40 +0200 Subject: [PATCH 59/77] drm/i915/gvt: simplify vgpu configuration management Instead of copying the information from the vgpu_types arrays into each intel_vgpu_type structure, just reference this constant information with a pointer to the already existing data structure, and pass it into the low-level VGPU creation helpers intead of copying the data into yet anothe params data structure. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Zhenyu Wang Link: https://lore.kernel.org/r/20220923092652.100656-3-hch@lst.de [aw: Fold fix from 20220928121110.GA30738@lst.de] Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/aperture_gm.c | 20 ++-- drivers/gpu/drm/i915/gvt/gvt.h | 37 +++--- drivers/gpu/drm/i915/gvt/kvmgt.c | 10 +- drivers/gpu/drm/i915/gvt/vgpu.c | 159 ++++++++----------------- 4 files changed, 81 insertions(+), 145 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/aperture_gm.c b/drivers/gpu/drm/i915/gvt/aperture_gm.c index 3b81a6d35a7b..076c779f776a 100644 --- a/drivers/gpu/drm/i915/gvt/aperture_gm.c +++ b/drivers/gpu/drm/i915/gvt/aperture_gm.c @@ -240,13 +240,13 @@ static void free_resource(struct intel_vgpu *vgpu) } static int alloc_resource(struct intel_vgpu *vgpu, - struct intel_vgpu_creation_params *param) + const struct intel_vgpu_config *conf) { struct intel_gvt *gvt = vgpu->gvt; unsigned long request, avail, max, taken; const char *item; - if (!param->low_gm_sz || !param->high_gm_sz || !param->fence_sz) { + if (!conf->low_mm || !conf->high_mm || !conf->fence) { gvt_vgpu_err("Invalid vGPU creation params\n"); return -EINVAL; } @@ -255,7 +255,7 @@ static int alloc_resource(struct intel_vgpu *vgpu, max = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE; taken = gvt->gm.vgpu_allocated_low_gm_size; avail = max - taken; - request = MB_TO_BYTES(param->low_gm_sz); + request = conf->low_mm; if (request > avail) goto no_enough_resource; @@ -266,7 +266,7 @@ static int alloc_resource(struct intel_vgpu *vgpu, max = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; taken = gvt->gm.vgpu_allocated_high_gm_size; avail = max - taken; - request = MB_TO_BYTES(param->high_gm_sz); + request = conf->high_mm; if (request > avail) goto no_enough_resource; @@ -277,16 +277,16 @@ static int alloc_resource(struct intel_vgpu *vgpu, max = gvt_fence_sz(gvt) - HOST_FENCE; taken = gvt->fence.vgpu_allocated_fence_num; avail = max - taken; - request = param->fence_sz; + request = conf->fence; if (request > avail) goto no_enough_resource; vgpu_fence_sz(vgpu) = request; - gvt->gm.vgpu_allocated_low_gm_size += MB_TO_BYTES(param->low_gm_sz); - gvt->gm.vgpu_allocated_high_gm_size += MB_TO_BYTES(param->high_gm_sz); - gvt->fence.vgpu_allocated_fence_num += param->fence_sz; + gvt->gm.vgpu_allocated_low_gm_size += conf->low_mm; + gvt->gm.vgpu_allocated_high_gm_size += conf->high_mm; + gvt->fence.vgpu_allocated_fence_num += conf->fence; return 0; no_enough_resource: @@ -340,11 +340,11 @@ void intel_vgpu_reset_resource(struct intel_vgpu *vgpu) * */ int intel_vgpu_alloc_resource(struct intel_vgpu *vgpu, - struct intel_vgpu_creation_params *param) + const struct intel_vgpu_config *conf) { int ret; - ret = alloc_resource(vgpu, param); + ret = alloc_resource(vgpu, conf); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 89fab7896fc6..563ffc2fdfb7 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -294,15 +294,26 @@ struct intel_gvt_firmware { bool firmware_loaded; }; +struct intel_vgpu_config { + unsigned int low_mm; + unsigned int high_mm; + unsigned int fence; + + /* + * A vGPU with a weight of 8 will get twice as much GPU as a vGPU with + * a weight of 4 on a contended host, different vGPU type has different + * weight set. Legal weights range from 1 to 16. + */ + unsigned int weight; + enum intel_vgpu_edid edid; + const char *name; +}; + #define NR_MAX_INTEL_VGPU_TYPES 20 struct intel_vgpu_type { char name[16]; + const struct intel_vgpu_config *conf; unsigned int avail_instance; - unsigned int low_gm_size; - unsigned int high_gm_size; - unsigned int fence; - unsigned int weight; - enum intel_vgpu_edid resolution; }; struct intel_gvt { @@ -436,19 +447,8 @@ int intel_gvt_load_firmware(struct intel_gvt *gvt); /* ring context size i.e. the first 0x50 dwords*/ #define RING_CTX_SIZE 320 -struct intel_vgpu_creation_params { - __u64 low_gm_sz; /* in MB */ - __u64 high_gm_sz; /* in MB */ - __u64 fence_sz; - __u64 resolution; - __s32 primary; - __u64 vgpu_id; - - __u32 weight; -}; - int intel_vgpu_alloc_resource(struct intel_vgpu *vgpu, - struct intel_vgpu_creation_params *param); + const struct intel_vgpu_config *conf); void intel_vgpu_reset_resource(struct intel_vgpu *vgpu); void intel_vgpu_free_resource(struct intel_vgpu *vgpu); void intel_vgpu_write_fence(struct intel_vgpu *vgpu, @@ -494,7 +494,8 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt); struct intel_vgpu *intel_gvt_create_idle_vgpu(struct intel_gvt *gvt); void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu); -int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, struct intel_vgpu_type *type); +int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, + const struct intel_vgpu_config *conf); void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu); void intel_gvt_release_vgpu(struct intel_vgpu *vgpu); void intel_gvt_reset_vgpu_locked(struct intel_vgpu *vgpu, bool dmlr, diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 9003145adb5a..7f3596394645 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -151,10 +151,10 @@ static ssize_t description_show(struct mdev_type *mtype, return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n" "fence: %d\nresolution: %s\n" "weight: %d\n", - BYTES_TO_MB(type->low_gm_size), - BYTES_TO_MB(type->high_gm_size), - type->fence, vgpu_edid_str(type->resolution), - type->weight); + BYTES_TO_MB(type->conf->low_mm), + BYTES_TO_MB(type->conf->high_mm), + type->conf->fence, vgpu_edid_str(type->conf->edid), + type->conf->weight); } static ssize_t name_show(struct mdev_type *mtype, @@ -1559,7 +1559,7 @@ static int intel_vgpu_init_dev(struct vfio_device *vfio_dev) return -EINVAL; vgpu->gvt = gvt; - return intel_gvt_create_vgpu(vgpu, type); + return intel_gvt_create_vgpu(vgpu, type->conf); } static void intel_vgpu_release_dev(struct vfio_device *vfio_dev) diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index dbb2a971ba5d..b0d5dafd013f 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -73,24 +73,21 @@ void populate_pvinfo_page(struct intel_vgpu *vgpu) drm_WARN_ON(&i915->drm, sizeof(struct vgt_if) != VGT_PVINFO_SIZE); } +/* + * vGPU type name is defined as GVTg_Vx_y which contains the physical GPU + * generation type (e.g V4 as BDW server, V5 as SKL server). + * + * Depening on the physical SKU resource, we might see vGPU types like + * GVTg_V4_8, GVTg_V4_4, GVTg_V4_2, etc. We can create different types of + * vGPU on same physical GPU depending on available resource. Each vGPU + * type will have a different number of avail_instance to indicate how + * many vGPU instance can be created for this type. + */ #define VGPU_MAX_WEIGHT 16 #define VGPU_WEIGHT(vgpu_num) \ (VGPU_MAX_WEIGHT / (vgpu_num)) -static const struct { - unsigned int low_mm; - unsigned int high_mm; - unsigned int fence; - - /* A vGPU with a weight of 8 will get twice as much GPU as a vGPU - * with a weight of 4 on a contended host, different vGPU type has - * different weight set. Legal weights range from 1 to 16. - */ - unsigned int weight; - enum intel_vgpu_edid edid; - const char *name; -} vgpu_types[] = { -/* Fixed vGPU type table */ +static const struct intel_vgpu_config intel_vgpu_configs[] = { { MB_TO_BYTES(64), MB_TO_BYTES(384), 4, VGPU_WEIGHT(8), GVT_EDID_1024_768, "8" }, { MB_TO_BYTES(128), MB_TO_BYTES(512), 4, VGPU_WEIGHT(4), GVT_EDID_1920_1200, "4" }, { MB_TO_BYTES(256), MB_TO_BYTES(1024), 4, VGPU_WEIGHT(2), GVT_EDID_1920_1200, "2" }, @@ -106,63 +103,34 @@ static const struct { */ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) { - unsigned int num_types; - unsigned int i, low_avail, high_avail; - unsigned int min_low; - - /* vGPU type name is defined as GVTg_Vx_y which contains - * physical GPU generation type (e.g V4 as BDW server, V5 as - * SKL server). - * - * Depend on physical SKU resource, might see vGPU types like - * GVTg_V4_8, GVTg_V4_4, GVTg_V4_2, etc. We can create - * different types of vGPU on same physical GPU depending on - * available resource. Each vGPU type will have "avail_instance" - * to indicate how many vGPU instance can be created for this - * type. - * - */ - low_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE; - high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; - num_types = ARRAY_SIZE(vgpu_types); + unsigned int low_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE; + unsigned int high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE; + unsigned int num_types = ARRAY_SIZE(intel_vgpu_configs); + unsigned int i; gvt->types = kcalloc(num_types, sizeof(struct intel_vgpu_type), GFP_KERNEL); if (!gvt->types) return -ENOMEM; - min_low = MB_TO_BYTES(32); for (i = 0; i < num_types; ++i) { - if (low_avail / vgpu_types[i].low_mm == 0) + const struct intel_vgpu_config *conf = &intel_vgpu_configs[i]; + + if (low_avail / conf->low_mm == 0) break; - - gvt->types[i].low_gm_size = vgpu_types[i].low_mm; - gvt->types[i].high_gm_size = vgpu_types[i].high_mm; - gvt->types[i].fence = vgpu_types[i].fence; - - if (vgpu_types[i].weight < 1 || - vgpu_types[i].weight > VGPU_MAX_WEIGHT) + if (conf->weight < 1 || conf->weight > VGPU_MAX_WEIGHT) goto out_free_types; - gvt->types[i].weight = vgpu_types[i].weight; - gvt->types[i].resolution = vgpu_types[i].edid; - gvt->types[i].avail_instance = min(low_avail / vgpu_types[i].low_mm, - high_avail / vgpu_types[i].high_mm); - - if (GRAPHICS_VER(gvt->gt->i915) == 8) - sprintf(gvt->types[i].name, "GVTg_V4_%s", - vgpu_types[i].name); - else if (GRAPHICS_VER(gvt->gt->i915) == 9) - sprintf(gvt->types[i].name, "GVTg_V5_%s", - vgpu_types[i].name); + sprintf(gvt->types[i].name, "GVTg_V%u_%s", + GRAPHICS_VER(gvt->gt->i915) == 8 ? 4 : 5, conf->name); + gvt->types[i].conf = conf; + gvt->types[i].avail_instance = min(low_avail / conf->low_mm, + high_avail / conf->high_mm); gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", - i, gvt->types[i].name, - gvt->types[i].avail_instance, - gvt->types[i].low_gm_size, - gvt->types[i].high_gm_size, gvt->types[i].fence, - gvt->types[i].weight, - vgpu_edid_str(gvt->types[i].resolution)); + i, gvt->types[i].name, gvt->types[i].avail_instance, + conf->low_mm, conf->high_mm, conf->fence, + conf->weight, vgpu_edid_str(conf->edid)); } gvt->num_types = i; @@ -195,16 +163,16 @@ static void intel_gvt_update_vgpu_types(struct intel_gvt *gvt) gvt->fence.vgpu_allocated_fence_num; for (i = 0; i < gvt->num_types; i++) { - low_gm_min = low_gm_avail / gvt->types[i].low_gm_size; - high_gm_min = high_gm_avail / gvt->types[i].high_gm_size; - fence_min = fence_avail / gvt->types[i].fence; + low_gm_min = low_gm_avail / gvt->types[i].conf->low_mm; + high_gm_min = high_gm_avail / gvt->types[i].conf->high_mm; + fence_min = fence_avail / gvt->types[i].conf->fence; gvt->types[i].avail_instance = min(min(low_gm_min, high_gm_min), fence_min); gvt_dbg_core("update type[%d]: %s avail %u low %u high %u fence %u\n", i, gvt->types[i].name, - gvt->types[i].avail_instance, gvt->types[i].low_gm_size, - gvt->types[i].high_gm_size, gvt->types[i].fence); + gvt->types[i].avail_instance, gvt->types[i].conf->low_mm, + gvt->types[i].conf->high_mm, gvt->types[i].conf->fence); } } @@ -365,37 +333,38 @@ void intel_gvt_destroy_idle_vgpu(struct intel_vgpu *vgpu) vfree(vgpu); } -static int __intel_gvt_create_vgpu(struct intel_vgpu *vgpu, - struct intel_vgpu_creation_params *param) +int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, + const struct intel_vgpu_config *conf) { struct intel_gvt *gvt = vgpu->gvt; struct drm_i915_private *dev_priv = gvt->gt->i915; int ret; - gvt_dbg_core("low %llu MB high %llu MB fence %llu\n", - param->low_gm_sz, param->high_gm_sz, - param->fence_sz); + gvt_dbg_core("low %u MB high %u MB fence %u\n", + BYTES_TO_MB(conf->low_mm), BYTES_TO_MB(conf->high_mm), + conf->fence); + mutex_lock(&gvt->lock); ret = idr_alloc(&gvt->vgpu_idr, vgpu, IDLE_VGPU_IDR + 1, GVT_MAX_VGPU, GFP_KERNEL); if (ret < 0) - return ret; + goto out_unlock;; vgpu->id = ret; - vgpu->sched_ctl.weight = param->weight; + vgpu->sched_ctl.weight = conf->weight; mutex_init(&vgpu->vgpu_lock); mutex_init(&vgpu->dmabuf_lock); INIT_LIST_HEAD(&vgpu->dmabuf_obj_list_head); INIT_RADIX_TREE(&vgpu->page_track_tree, GFP_KERNEL); idr_init_base(&vgpu->object_idr, 1); - intel_vgpu_init_cfg_space(vgpu, param->primary); + intel_vgpu_init_cfg_space(vgpu, 1); vgpu->d3_entered = false; ret = intel_vgpu_init_mmio(vgpu); if (ret) goto out_clean_idr; - ret = intel_vgpu_alloc_resource(vgpu, param); + ret = intel_vgpu_alloc_resource(vgpu, conf); if (ret) goto out_clean_vgpu_mmio; @@ -409,7 +378,7 @@ static int __intel_gvt_create_vgpu(struct intel_vgpu *vgpu, if (ret) goto out_clean_gtt; - ret = intel_vgpu_init_display(vgpu, param->resolution); + ret = intel_vgpu_init_display(vgpu, conf->edid); if (ret) goto out_clean_opregion; @@ -434,6 +403,9 @@ static int __intel_gvt_create_vgpu(struct intel_vgpu *vgpu, if (ret) goto out_clean_sched_policy; + intel_gvt_update_vgpu_types(gvt); + intel_gvt_update_reg_whitelist(vgpu); + mutex_unlock(&gvt->lock); return 0; out_clean_sched_policy: @@ -452,45 +424,8 @@ static int __intel_gvt_create_vgpu(struct intel_vgpu *vgpu, intel_vgpu_clean_mmio(vgpu); out_clean_idr: idr_remove(&gvt->vgpu_idr, vgpu->id); - return ret; -} - -/** - * intel_gvt_create_vgpu - create a virtual GPU - * @gvt: GVT device - * @type: type of the vGPU to create - * - * This function is called when user wants to create a virtual GPU. - * - * Returns: - * pointer to intel_vgpu, error pointer if failed. - */ -int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, struct intel_vgpu_type *type) -{ - struct intel_gvt *gvt = vgpu->gvt; - struct intel_vgpu_creation_params param; - int ret; - - param.primary = 1; - param.low_gm_sz = type->low_gm_size; - param.high_gm_sz = type->high_gm_size; - param.fence_sz = type->fence; - param.weight = type->weight; - param.resolution = type->resolution; - - /* XXX current param based on MB */ - param.low_gm_sz = BYTES_TO_MB(param.low_gm_sz); - param.high_gm_sz = BYTES_TO_MB(param.high_gm_sz); - - mutex_lock(&gvt->lock); - ret = __intel_gvt_create_vgpu(vgpu, ¶m); - if (!ret) { - /* calculate left instance change for types */ - intel_gvt_update_vgpu_types(gvt); - intel_gvt_update_reg_whitelist(vgpu); - } +out_unlock: mutex_unlock(&gvt->lock); - return ret; } From bdef2b7896df293736330eb6eb0f43947049b828 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:41 +0200 Subject: [PATCH 60/77] vfio/mdev: make mdev.h standalone includable Include and so that users of this headers don't need to do that and remove those includes that aren't needed any more. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Farman Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-4-hch@lst.de Signed-off-by: Alex Williamson --- drivers/gpu/drm/i915/gvt/kvmgt.c | 2 -- drivers/s390/cio/vfio_ccw_drv.c | 1 - drivers/s390/crypto/vfio_ap_private.h | 1 - drivers/vfio/mdev/mdev_core.c | 2 -- drivers/vfio/mdev/mdev_driver.c | 1 - drivers/vfio/mdev/mdev_sysfs.c | 2 -- include/linux/mdev.h | 3 +++ samples/vfio-mdev/mbochs.c | 1 - samples/vfio-mdev/mdpy.c | 1 - samples/vfio-mdev/mtty.c | 2 -- 10 files changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 7f3596394645..ee314402fb61 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -34,7 +34,6 @@ */ #include -#include #include #include #include @@ -43,7 +42,6 @@ #include #include #include -#include #include #include diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 86d9e428357b..e9985c63dc6b 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -12,7 +12,6 @@ #include #include -#include #include #include diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index d782cf463eab..163eeaaf24ce 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -13,7 +13,6 @@ #define _VFIO_AP_PRIVATE_H_ #include -#include #include #include #include diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index b8b9e7911e55..2c32923fbad2 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -8,9 +8,7 @@ */ #include -#include #include -#include #include #include diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 9c2af59809e2..7bd4bb9850e8 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -7,7 +7,6 @@ * Kirti Wankhede */ -#include #include #include diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 0ccfeb3dda24..4bfbf49aaa66 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -9,9 +9,7 @@ #include #include -#include #include -#include #include #include "mdev_private.h" diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 47ad3b104d9e..a5d8ae6132a2 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -10,6 +10,9 @@ #ifndef MDEV_H #define MDEV_H +#include +#include + struct mdev_type; struct mdev_device { diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 6901947e27d2..985b6e713621 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -21,7 +21,6 @@ */ #include #include -#include #include #include #include diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index bb2af1ec0f7c..1daab012b5d8 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -17,7 +17,6 @@ */ #include #include -#include #include #include #include diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index d151928e4f21..86843ce3d9a2 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include @@ -20,7 +19,6 @@ #include #include #include -#include #include #include #include From 89345d5177aa0f6d678251e1e0870b0eeb1ab510 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:42 +0200 Subject: [PATCH 61/77] vfio/mdev: embedd struct mdev_parent in the parent data structure Simplify mdev_{un}register_device by requiring the caller to pass in a structure allocate as part of the parent device structure. This removes the need for a list of parents and the separate mdev_parent refcount as we can simplify rely on the reference to the parent device. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-5-hch@lst.de Signed-off-by: Alex Williamson --- .../driver-api/vfio-mediated-device.rst | 12 +- Documentation/s390/vfio-ap.rst | 2 +- Documentation/s390/vfio-ccw.rst | 2 +- drivers/gpu/drm/i915/gvt/gvt.h | 2 + drivers/gpu/drm/i915/gvt/kvmgt.c | 5 +- drivers/s390/cio/vfio_ccw_drv.c | 5 +- drivers/s390/cio/vfio_ccw_ops.c | 1 - drivers/s390/cio/vfio_ccw_private.h | 4 + drivers/s390/crypto/vfio_ap_ops.c | 5 +- drivers/s390/crypto/vfio_ap_private.h | 1 + drivers/vfio/mdev/mdev_core.c | 120 ++++-------------- drivers/vfio/mdev/mdev_private.h | 23 ---- drivers/vfio/mdev/mdev_sysfs.c | 4 +- include/linux/mdev.h | 15 ++- samples/vfio-mdev/mbochs.c | 5 +- samples/vfio-mdev/mdpy.c | 5 +- samples/vfio-mdev/mtty.c | 6 +- 17 files changed, 71 insertions(+), 146 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index f47dca6645aa..cd1667608ab5 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -58,19 +58,19 @@ devices as examples, as these devices are the first devices to use this module:: | MDEV CORE | | MODULE | | mdev.ko | - | +-----------+ | mdev_register_device() +--------------+ + | +-----------+ | mdev_register_parent() +--------------+ | | | +<------------------------+ | | | | | | nvidia.ko |<-> physical | | | +------------------------>+ | device | | | | callbacks +--------------+ | | Physical | | - | | device | | mdev_register_device() +--------------+ + | | device | | mdev_register_parent() +--------------+ | | interface | |<------------------------+ | | | | | | i915.ko |<-> physical | | | +------------------------>+ | device | | | | callbacks +--------------+ | | | | - | | | | mdev_register_device() +--------------+ + | | | | mdev_register_parent() +--------------+ | | | +<------------------------+ | | | | | | ccw_device.ko|<-> physical | | | +------------------------>+ | device @@ -125,8 +125,8 @@ vfio_device_ops. When a driver wants to add the GUID creation sysfs to an existing device it has probe'd to then it should call:: - int mdev_register_device(struct device *dev, - struct mdev_driver *mdev_driver); + int mdev_register_parent(struct mdev_parent *parent, struct device *dev, + struct mdev_driver *mdev_driver); This will provide the 'mdev_supported_types/XX/create' files which can then be used to trigger the creation of a mdev_device. The created mdev_device will be @@ -134,7 +134,7 @@ attached to the specified driver. When the driver needs to remove itself it calls:: - void mdev_unregister_device(struct device *dev); + void mdev_unregister_parent(struct mdev_parent *parent); Which will unbind and destroy all the created mdevs and remove the sysfs files. diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/s390/vfio-ap.rst index 61a0a3c6c7b4..00f4a04f6d4c 100644 --- a/Documentation/s390/vfio-ap.rst +++ b/Documentation/s390/vfio-ap.rst @@ -297,7 +297,7 @@ of the VFIO AP mediated device driver:: | MDEV CORE | | MODULE | | mdev.ko | - | +---------+ | mdev_register_device() +--------------+ + | +---------+ | mdev_register_parent() +--------------+ | |Physical | +<-----------------------+ | | | device | | | vfio_ap.ko |<-> matrix | |interface| +----------------------->+ | device diff --git a/Documentation/s390/vfio-ccw.rst b/Documentation/s390/vfio-ccw.rst index 8aad08a8b8a5..ea928a3806f4 100644 --- a/Documentation/s390/vfio-ccw.rst +++ b/Documentation/s390/vfio-ccw.rst @@ -156,7 +156,7 @@ Below is a high Level block diagram:: | MDEV CORE | | MODULE | | mdev.ko | - | +---------+ | mdev_register_device() +--------------+ + | +---------+ | mdev_register_parent() +--------------+ | |Physical | +<-----------------------+ | | | device | | | vfio_ccw.ko |<-> subchannel | |interface| +----------------------->+ | device diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index 563ffc2fdfb7..fa4a56b50c82 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -36,6 +36,7 @@ #include #include #include +#include #include "i915_drv.h" #include "intel_gvt.h" @@ -337,6 +338,7 @@ struct intel_gvt { struct intel_gvt_workload_scheduler scheduler; struct notifier_block shadow_ctx_notifier_block[I915_NUM_ENGINES]; DECLARE_HASHTABLE(cmd_table, GVT_CMD_HASH_BITS); + struct mdev_parent parent; struct intel_vgpu_type *types; unsigned int num_types; struct intel_vgpu *idle_vgpu; diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index ee314402fb61..d7afe3f5f75b 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1923,7 +1923,7 @@ static void intel_gvt_clean_device(struct drm_i915_private *i915) if (drm_WARN_ON(&i915->drm, !gvt)) return; - mdev_unregister_device(i915->drm.dev); + mdev_unregister_parent(&gvt->parent); intel_gvt_cleanup_vgpu_type_groups(gvt); intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu); intel_gvt_clean_vgpu_types(gvt); @@ -2028,7 +2028,8 @@ static int intel_gvt_init_device(struct drm_i915_private *i915) if (ret) goto out_destroy_idle_vgpu; - ret = mdev_register_device(i915->drm.dev, &intel_vgpu_mdev_driver); + ret = mdev_register_parent(&gvt->parent, i915->drm.dev, + &intel_vgpu_mdev_driver); if (ret) goto out_cleanup_vgpu_type_groups; diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index e9985c63dc6b..7d105915bd14 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -221,7 +221,8 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) dev_set_drvdata(&sch->dev, private); - ret = mdev_register_device(&sch->dev, &vfio_ccw_mdev_driver); + ret = mdev_register_parent(&private->parent, &sch->dev, + &vfio_ccw_mdev_driver); if (ret) goto out_free; @@ -240,7 +241,7 @@ static void vfio_ccw_sch_remove(struct subchannel *sch) { struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); - mdev_unregister_device(&sch->dev); + mdev_unregister_parent(&private->parent); dev_set_drvdata(&sch->dev, NULL); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 9f8486c0d3d3..9a0e0c5ffb1a 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -11,7 +11,6 @@ */ #include -#include #include #include diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 63d9202b29c7..1a4bfb1b5a80 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -89,6 +90,7 @@ struct vfio_ccw_crw { * @io_work: work for deferral process of I/O handling * @crw_work: work for deferral process of CRW handling * @release_comp: synchronization helper for vfio device release + * @parent: parent data structures for mdevs created */ struct vfio_ccw_private { struct vfio_device vdev; @@ -116,6 +118,8 @@ struct vfio_ccw_private { struct work_struct crw_work; struct completion release_comp; + + struct mdev_parent parent; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 161597357a64..724d09a74a8f 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1830,7 +1830,8 @@ int vfio_ap_mdev_register(void) if (ret) return ret; - ret = mdev_register_device(&matrix_dev->device, &vfio_ap_matrix_driver); + ret = mdev_register_parent(&matrix_dev->parent, &matrix_dev->device, + &vfio_ap_matrix_driver); if (ret) goto err_driver; return 0; @@ -1842,7 +1843,7 @@ int vfio_ap_mdev_register(void) void vfio_ap_mdev_unregister(void) { - mdev_unregister_device(&matrix_dev->device); + mdev_unregister_parent(&matrix_dev->parent); mdev_unregister_driver(&vfio_ap_matrix_driver); } diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 163eeaaf24ce..35165730f517 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -52,6 +52,7 @@ struct ap_matrix_dev { struct mutex mdevs_lock; /* serializes access to each ap_matrix_mdev */ struct ap_driver *vfio_ap_drv; struct mutex guests_lock; /* serializes access to each KVM guest */ + struct mdev_parent parent; }; extern struct ap_matrix_dev *matrix_dev; diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 2c32923fbad2..fa05ac339695 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -18,8 +18,6 @@ #define DRIVER_AUTHOR "NVIDIA Corporation" #define DRIVER_DESC "Mediated device Core Driver" -static LIST_HEAD(parent_list); -static DEFINE_MUTEX(parent_list_lock); static struct class_compat *mdev_bus_compat_class; static LIST_HEAD(mdev_list); @@ -61,28 +59,6 @@ struct device *mtype_get_parent_dev(struct mdev_type *mtype) } EXPORT_SYMBOL(mtype_get_parent_dev); -/* Should be called holding parent_list_lock */ -static struct mdev_parent *__find_parent_device(struct device *dev) -{ - struct mdev_parent *parent; - - list_for_each_entry(parent, &parent_list, next) { - if (parent->dev == dev) - return parent; - } - return NULL; -} - -void mdev_release_parent(struct kref *kref) -{ - struct mdev_parent *parent = container_of(kref, struct mdev_parent, - ref); - struct device *dev = parent->dev; - - kfree(parent); - put_device(dev); -} - /* Caller must hold parent unreg_sem read or write lock */ static void mdev_device_remove_common(struct mdev_device *mdev) { @@ -105,125 +81,73 @@ static int mdev_device_remove_cb(struct device *dev, void *data) } /* - * mdev_register_device : Register a device + * mdev_register_parent: Register a device as parent for mdevs + * @parent: parent structure registered * @dev: device structure representing parent device. * @mdev_driver: Device driver to bind to the newly created mdev * - * Add device to list of registered parent devices. + * Registers the @parent stucture as a parent for mdev types and thus mdev + * devices. The caller needs to hold a reference on @dev that must not be + * released until after the call to mdev_unregister_parent(). + * * Returns a negative value on error, otherwise 0. */ -int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver) +int mdev_register_parent(struct mdev_parent *parent, struct device *dev, + struct mdev_driver *mdev_driver) { - int ret; - struct mdev_parent *parent; char *env_string = "MDEV_STATE=registered"; char *envp[] = { env_string, NULL }; + int ret; /* check for mandatory ops */ if (!mdev_driver->supported_type_groups) return -EINVAL; - dev = get_device(dev); - if (!dev) - return -EINVAL; - - mutex_lock(&parent_list_lock); - - /* Check for duplicate */ - parent = __find_parent_device(dev); - if (parent) { - parent = NULL; - ret = -EEXIST; - goto add_dev_err; - } - - parent = kzalloc(sizeof(*parent), GFP_KERNEL); - if (!parent) { - ret = -ENOMEM; - goto add_dev_err; - } - - kref_init(&parent->ref); + memset(parent, 0, sizeof(*parent)); init_rwsem(&parent->unreg_sem); - parent->dev = dev; parent->mdev_driver = mdev_driver; if (!mdev_bus_compat_class) { mdev_bus_compat_class = class_compat_register("mdev_bus"); - if (!mdev_bus_compat_class) { - ret = -ENOMEM; - goto add_dev_err; - } + if (!mdev_bus_compat_class) + return -ENOMEM; } ret = parent_create_sysfs_files(parent); if (ret) - goto add_dev_err; + return ret; ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL); if (ret) dev_warn(dev, "Failed to create compatibility class link\n"); - list_add(&parent->next, &parent_list); - mutex_unlock(&parent_list_lock); - dev_info(dev, "MDEV: Registered\n"); kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp); - return 0; - -add_dev_err: - mutex_unlock(&parent_list_lock); - if (parent) - mdev_put_parent(parent); - else - put_device(dev); - return ret; } -EXPORT_SYMBOL(mdev_register_device); +EXPORT_SYMBOL(mdev_register_parent); /* - * mdev_unregister_device : Unregister a parent device - * @dev: device structure representing parent device. - * - * Remove device from list of registered parent devices. Give a chance to free - * existing mediated devices for given device. + * mdev_unregister_parent : Unregister a parent device + * @parent: parent structure to unregister */ - -void mdev_unregister_device(struct device *dev) +void mdev_unregister_parent(struct mdev_parent *parent) { - struct mdev_parent *parent; char *env_string = "MDEV_STATE=unregistered"; char *envp[] = { env_string, NULL }; - mutex_lock(&parent_list_lock); - parent = __find_parent_device(dev); - - if (!parent) { - mutex_unlock(&parent_list_lock); - return; - } - dev_info(dev, "MDEV: Unregistering\n"); - - list_del(&parent->next); - mutex_unlock(&parent_list_lock); + dev_info(parent->dev, "MDEV: Unregistering\n"); down_write(&parent->unreg_sem); - - class_compat_remove_link(mdev_bus_compat_class, dev, NULL); - - device_for_each_child(dev, NULL, mdev_device_remove_cb); - + class_compat_remove_link(mdev_bus_compat_class, parent->dev, NULL); + device_for_each_child(parent->dev, NULL, mdev_device_remove_cb); parent_remove_sysfs_files(parent); up_write(&parent->unreg_sem); - mdev_put_parent(parent); - - /* We still have the caller's reference to use for the uevent */ - kobject_uevent_env(&dev->kobj, KOBJ_CHANGE, envp); + kobject_uevent_env(&parent->dev->kobj, KOBJ_CHANGE, envp); } -EXPORT_SYMBOL(mdev_unregister_device); +EXPORT_SYMBOL(mdev_unregister_parent); static void mdev_device_release(struct device *dev) { diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 7c9fc79f3d83..297f911fdc89 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -13,17 +13,6 @@ int mdev_bus_register(void); void mdev_bus_unregister(void); -struct mdev_parent { - struct device *dev; - struct mdev_driver *mdev_driver; - struct kref ref; - struct list_head next; - struct kset *mdev_types_kset; - struct list_head type_list; - /* Synchronize device creation/removal with parent unregistration */ - struct rw_semaphore unreg_sem; -}; - struct mdev_type { struct kobject kobj; struct kobject *devices_kobj; @@ -48,16 +37,4 @@ void mdev_remove_sysfs_files(struct mdev_device *mdev); int mdev_device_create(struct mdev_type *kobj, const guid_t *uuid); int mdev_device_remove(struct mdev_device *dev); -void mdev_release_parent(struct kref *kref); - -static inline void mdev_get_parent(struct mdev_parent *parent) -{ - kref_get(&parent->ref); -} - -static inline void mdev_put_parent(struct mdev_parent *parent) -{ - kref_put(&parent->ref, mdev_release_parent); -} - #endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 4bfbf49aaa66..b71ffc559487 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -81,7 +81,7 @@ static void mdev_type_release(struct kobject *kobj) pr_debug("Releasing group %s\n", kobj->name); /* Pairs with the get in add_mdev_supported_type() */ - mdev_put_parent(type->parent); + put_device(type->parent->dev); kfree(type); } @@ -110,7 +110,7 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, type->kobj.kset = parent->mdev_types_kset; type->parent = parent; /* Pairs with the put in mdev_type_release() */ - mdev_get_parent(parent); + get_device(parent->dev); type->type_group_id = type_group_id; ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, diff --git a/include/linux/mdev.h b/include/linux/mdev.h index a5d8ae6132a2..262512c2a8ff 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -23,6 +23,16 @@ struct mdev_device { bool active; }; +/* embedded into the struct device that the mdev devices hang off */ +struct mdev_parent { + struct device *dev; + struct mdev_driver *mdev_driver; + struct kset *mdev_types_kset; + struct list_head type_list; + /* Synchronize device creation/removal with parent unregistration */ + struct rw_semaphore unreg_sem; +}; + static inline struct mdev_device *to_mdev_device(struct device *dev) { return container_of(dev, struct mdev_device, dev); @@ -70,8 +80,9 @@ struct mdev_driver { extern struct bus_type mdev_bus_type; -int mdev_register_device(struct device *dev, struct mdev_driver *mdev_driver); -void mdev_unregister_device(struct device *dev); +int mdev_register_parent(struct mdev_parent *parent, struct device *dev, + struct mdev_driver *mdev_driver); +void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 985b6e713621..2c4791abbc3d 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -128,6 +128,7 @@ static dev_t mbochs_devt; static struct class *mbochs_class; static struct cdev mbochs_cdev; static struct device mbochs_dev; +static struct mdev_parent mbochs_parent; static atomic_t mbochs_avail_mbytes; static const struct vfio_device_ops mbochs_dev_ops; @@ -1475,7 +1476,7 @@ static int __init mbochs_dev_init(void) if (ret) goto err_class; - ret = mdev_register_device(&mbochs_dev, &mbochs_driver); + ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver); if (ret) goto err_device; @@ -1496,7 +1497,7 @@ static int __init mbochs_dev_init(void) static void __exit mbochs_dev_exit(void) { mbochs_dev.bus = NULL; - mdev_unregister_device(&mbochs_dev); + mdev_unregister_parent(&mbochs_parent); device_unregister(&mbochs_dev); mdev_unregister_driver(&mbochs_driver); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 1daab012b5d8..01f345430b97 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -83,6 +83,7 @@ static dev_t mdpy_devt; static struct class *mdpy_class; static struct cdev mdpy_cdev; static struct device mdpy_dev; +static struct mdev_parent mdpy_parent; static u32 mdpy_count; static const struct vfio_device_ops mdpy_dev_ops; @@ -778,7 +779,7 @@ static int __init mdpy_dev_init(void) if (ret) goto err_class; - ret = mdev_register_device(&mdpy_dev, &mdpy_driver); + ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver); if (ret) goto err_device; @@ -799,7 +800,7 @@ static int __init mdpy_dev_init(void) static void __exit mdpy_dev_exit(void) { mdpy_dev.bus = NULL; - mdev_unregister_device(&mdpy_dev); + mdev_unregister_parent(&mdpy_parent); device_unregister(&mdpy_dev); mdev_unregister_driver(&mdpy_driver); diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 86843ce3d9a2..e80baac51381 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -72,6 +72,7 @@ static struct mtty_dev { struct cdev vd_cdev; struct idr vd_idr; struct device dev; + struct mdev_parent parent; } mtty_dev; struct mdev_region_info { @@ -1361,7 +1362,8 @@ static int __init mtty_dev_init(void) if (ret) goto err_class; - ret = mdev_register_device(&mtty_dev.dev, &mtty_driver); + ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev, + &mtty_driver); if (ret) goto err_device; return 0; @@ -1381,7 +1383,7 @@ static int __init mtty_dev_init(void) static void __exit mtty_dev_exit(void) { mtty_dev.dev.bus = NULL; - mdev_unregister_device(&mtty_dev.dev); + mdev_unregister_parent(&mtty_dev.parent); device_unregister(&mtty_dev.dev); idr_destroy(&mtty_dev.vd_idr); From da44c340c4fe9d9653ae84fa6a60f406bafcffce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:43 +0200 Subject: [PATCH 62/77] vfio/mdev: simplify mdev_type handling Instead of abusing struct attribute_group to control initialization of struct mdev_type, just define the actual attributes in the mdev_driver, allocate the mdev_type structures in the caller and pass them to mdev_register_parent. This allows the caller to use container_of to get at the containing structure and thus significantly simplify the code. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-6-hch@lst.de Signed-off-by: Alex Williamson --- .../driver-api/vfio-mediated-device.rst | 2 +- drivers/gpu/drm/i915/gvt/gvt.h | 3 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 102 +++--------------- drivers/gpu/drm/i915/gvt/vgpu.c | 13 ++- drivers/s390/cio/vfio_ccw_drv.c | 6 +- drivers/s390/cio/vfio_ccw_ops.c | 14 +-- drivers/s390/cio/vfio_ccw_private.h | 2 + drivers/s390/crypto/vfio_ap_ops.c | 19 ++-- drivers/s390/crypto/vfio_ap_private.h | 2 + drivers/vfio/mdev/mdev_core.c | 31 ++---- drivers/vfio/mdev/mdev_driver.c | 5 +- drivers/vfio/mdev/mdev_private.h | 8 -- drivers/vfio/mdev/mdev_sysfs.c | 91 ++++------------ include/linux/mdev.h | 26 +++-- samples/vfio-mdev/mbochs.c | 57 ++++------ samples/vfio-mdev/mdpy.c | 50 ++++----- samples/vfio-mdev/mtty.c | 60 +++++------ 17 files changed, 165 insertions(+), 326 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index cd1667608ab5..ff7342d2e332 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -103,7 +103,7 @@ structure to represent a mediated device's driver:: struct mdev_driver { int (*probe) (struct mdev_device *dev); void (*remove) (struct mdev_device *dev); - struct attribute_group **supported_type_groups; + const struct attribute * const *types_attrs; struct device_driver driver; }; diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index fa4a56b50c82..db182066d56c 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -310,8 +310,8 @@ struct intel_vgpu_config { const char *name; }; -#define NR_MAX_INTEL_VGPU_TYPES 20 struct intel_vgpu_type { + struct mdev_type type; char name[16]; const struct intel_vgpu_config *conf; unsigned int avail_instance; @@ -339,6 +339,7 @@ struct intel_gvt { struct notifier_block shadow_ctx_notifier_block[I915_NUM_ENGINES]; DECLARE_HASHTABLE(cmd_table, GVT_CMD_HASH_BITS); struct mdev_parent parent; + struct mdev_type **mdev_types; struct intel_vgpu_type *types; unsigned int num_types; struct intel_vgpu *idle_vgpu; diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index d7afe3f5f75b..12b0b3306168 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -117,17 +117,10 @@ static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - struct intel_vgpu_type *type; - unsigned int num = 0; - struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt; + struct intel_vgpu_type *type = + container_of(mtype, struct intel_vgpu_type, type); - type = &gvt->types[mtype_get_type_group_id(mtype)]; - if (!type) - num = 0; - else - num = type->avail_instance; - - return sprintf(buf, "%u\n", num); + return sprintf(buf, "%u\n", type->avail_instance); } static ssize_t device_api_show(struct mdev_type *mtype, @@ -139,12 +132,8 @@ static ssize_t device_api_show(struct mdev_type *mtype, static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - struct intel_vgpu_type *type; - struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt; - - type = &gvt->types[mtype_get_type_group_id(mtype)]; - if (!type) - return 0; + struct intel_vgpu_type *type = + container_of(mtype, struct intel_vgpu_type, type); return sprintf(buf, "low_gm_size: %dMB\nhigh_gm_size: %dMB\n" "fence: %d\nresolution: %s\n" @@ -158,14 +147,7 @@ static ssize_t description_show(struct mdev_type *mtype, static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - struct intel_vgpu_type *type; - struct intel_gvt *gvt = kdev_to_i915(mtype_get_parent_dev(mtype))->gvt; - - type = &gvt->types[mtype_get_type_group_id(mtype)]; - if (!type) - return 0; - - return sprintf(buf, "%s\n", type->name); + return sprintf(buf, "%s\n", mtype->sysfs_name); } static MDEV_TYPE_ATTR_RO(available_instances); @@ -173,7 +155,7 @@ static MDEV_TYPE_ATTR_RO(device_api); static MDEV_TYPE_ATTR_RO(description); static MDEV_TYPE_ATTR_RO(name); -static struct attribute *gvt_type_attrs[] = { +static const struct attribute *gvt_type_attrs[] = { &mdev_type_attr_available_instances.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_description.attr, @@ -181,51 +163,6 @@ static struct attribute *gvt_type_attrs[] = { NULL, }; -static struct attribute_group *gvt_vgpu_type_groups[] = { - [0 ... NR_MAX_INTEL_VGPU_TYPES - 1] = NULL, -}; - -static int intel_gvt_init_vgpu_type_groups(struct intel_gvt *gvt) -{ - int i, j; - struct intel_vgpu_type *type; - struct attribute_group *group; - - for (i = 0; i < gvt->num_types; i++) { - type = &gvt->types[i]; - - group = kzalloc(sizeof(struct attribute_group), GFP_KERNEL); - if (!group) - goto unwind; - - group->name = type->name; - group->attrs = gvt_type_attrs; - gvt_vgpu_type_groups[i] = group; - } - - return 0; - -unwind: - for (j = 0; j < i; j++) { - group = gvt_vgpu_type_groups[j]; - kfree(group); - } - - return -ENOMEM; -} - -static void intel_gvt_cleanup_vgpu_type_groups(struct intel_gvt *gvt) -{ - int i; - struct attribute_group *group; - - for (i = 0; i < gvt->num_types; i++) { - group = gvt_vgpu_type_groups[i]; - gvt_vgpu_type_groups[i] = NULL; - kfree(group); - } -} - static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, unsigned long size) { @@ -1547,16 +1484,11 @@ static const struct attribute_group *intel_vgpu_groups[] = { static int intel_vgpu_init_dev(struct vfio_device *vfio_dev) { struct mdev_device *mdev = to_mdev_device(vfio_dev->dev); - struct device *pdev = mdev_parent_dev(mdev); - struct intel_gvt *gvt = kdev_to_i915(pdev)->gvt; - struct intel_vgpu_type *type; struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); + struct intel_vgpu_type *type = + container_of(mdev->type, struct intel_vgpu_type, type); - type = &gvt->types[mdev_get_type_group_id(mdev)]; - if (!type) - return -EINVAL; - - vgpu->gvt = gvt; + vgpu->gvt = kdev_to_i915(mdev_parent_dev(mdev))->gvt; return intel_gvt_create_vgpu(vgpu, type->conf); } @@ -1625,7 +1557,7 @@ static struct mdev_driver intel_vgpu_mdev_driver = { }, .probe = intel_vgpu_probe, .remove = intel_vgpu_remove, - .supported_type_groups = gvt_vgpu_type_groups, + .types_attrs = gvt_type_attrs, }; int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn) @@ -1924,7 +1856,6 @@ static void intel_gvt_clean_device(struct drm_i915_private *i915) return; mdev_unregister_parent(&gvt->parent); - intel_gvt_cleanup_vgpu_type_groups(gvt); intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu); intel_gvt_clean_vgpu_types(gvt); @@ -2024,20 +1955,15 @@ static int intel_gvt_init_device(struct drm_i915_private *i915) intel_gvt_debugfs_init(gvt); - ret = intel_gvt_init_vgpu_type_groups(gvt); + ret = mdev_register_parent(&gvt->parent, i915->drm.dev, + &intel_vgpu_mdev_driver, + gvt->mdev_types, gvt->num_types); if (ret) goto out_destroy_idle_vgpu; - ret = mdev_register_parent(&gvt->parent, i915->drm.dev, - &intel_vgpu_mdev_driver); - if (ret) - goto out_cleanup_vgpu_type_groups; - gvt_dbg_core("gvt device initialization is done\n"); return 0; -out_cleanup_vgpu_type_groups: - intel_gvt_cleanup_vgpu_type_groups(gvt); out_destroy_idle_vgpu: intel_gvt_destroy_idle_vgpu(gvt->idle_vgpu); intel_gvt_debugfs_clean(gvt); diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index b0d5dafd013f..92aaa77fecee 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -113,13 +113,18 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) if (!gvt->types) return -ENOMEM; + gvt->mdev_types = kcalloc(num_types, sizeof(*gvt->mdev_types), + GFP_KERNEL); + if (!gvt->mdev_types) + goto out_free_types; + for (i = 0; i < num_types; ++i) { const struct intel_vgpu_config *conf = &intel_vgpu_configs[i]; if (low_avail / conf->low_mm == 0) break; if (conf->weight < 1 || conf->weight > VGPU_MAX_WEIGHT) - goto out_free_types; + goto out_free_mdev_types; sprintf(gvt->types[i].name, "GVTg_V%u_%s", GRAPHICS_VER(gvt->gt->i915) == 8 ? 4 : 5, conf->name); @@ -131,11 +136,16 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) i, gvt->types[i].name, gvt->types[i].avail_instance, conf->low_mm, conf->high_mm, conf->fence, conf->weight, vgpu_edid_str(conf->edid)); + + gvt->mdev_types[i] = &gvt->types[i].type; + gvt->mdev_types[i]->sysfs_name = gvt->types[i].name; } gvt->num_types = i; return 0; +out_free_mdev_types: + kfree(gvt->mdev_types); out_free_types: kfree(gvt->types); return -EINVAL; @@ -143,6 +153,7 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt) { + kfree(gvt->mdev_types); kfree(gvt->types); } diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 7d105915bd14..25a5de08b390 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -202,7 +202,6 @@ static void vfio_ccw_free_private(struct vfio_ccw_private *private) mutex_destroy(&private->io_mutex); kfree(private); } - static int vfio_ccw_sch_probe(struct subchannel *sch) { struct pmcw *pmcw = &sch->schib.pmcw; @@ -221,8 +220,11 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) dev_set_drvdata(&sch->dev, private); + private->mdev_type.sysfs_name = "io"; + private->mdev_types[0] = &private->mdev_type; ret = mdev_register_parent(&private->parent, &sch->dev, - &vfio_ccw_mdev_driver); + &vfio_ccw_mdev_driver, + private->mdev_types, 1); if (ret) goto out_free; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 9a0e0c5ffb1a..c37e712a4b06 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -69,23 +69,13 @@ static ssize_t available_instances_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); -static struct attribute *mdev_types_attrs[] = { +static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; -static struct attribute_group mdev_type_group = { - .name = "io", - .attrs = mdev_types_attrs, -}; - -static struct attribute_group *mdev_type_groups[] = { - &mdev_type_group, - NULL, -}; - static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) { struct vfio_ccw_private *private = @@ -646,5 +636,5 @@ struct mdev_driver vfio_ccw_mdev_driver = { }, .probe = vfio_ccw_mdev_probe, .remove = vfio_ccw_mdev_remove, - .supported_type_groups = mdev_type_groups, + .types_attrs = mdev_types_attrs, }; diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 1a4bfb1b5a80..52caa721ec06 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -120,6 +120,8 @@ struct vfio_ccw_private { struct completion release_comp; struct mdev_parent parent; + struct mdev_type mdev_type; + struct mdev_type *mdev_types[1]; } __aligned(8); int vfio_ccw_sch_quiesce(struct subchannel *sch); diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 724d09a74a8f..24d131c502ca 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -816,23 +816,13 @@ static ssize_t device_api_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(device_api); -static struct attribute *vfio_ap_mdev_type_attrs[] = { +static const struct attribute *vfio_ap_mdev_type_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; -static struct attribute_group vfio_ap_mdev_hwvirt_type_group = { - .name = VFIO_AP_MDEV_TYPE_HWVIRT, - .attrs = vfio_ap_mdev_type_attrs, -}; - -static struct attribute_group *vfio_ap_mdev_type_groups[] = { - &vfio_ap_mdev_hwvirt_type_group, - NULL, -}; - #define MDEV_SHARING_ERR "Userspace may not re-assign queue %02lx.%04lx " \ "already assigned to %s" @@ -1817,7 +1807,7 @@ static struct mdev_driver vfio_ap_matrix_driver = { }, .probe = vfio_ap_mdev_probe, .remove = vfio_ap_mdev_remove, - .supported_type_groups = vfio_ap_mdev_type_groups, + .types_attrs = vfio_ap_mdev_type_attrs, }; int vfio_ap_mdev_register(void) @@ -1830,8 +1820,11 @@ int vfio_ap_mdev_register(void) if (ret) return ret; + matrix_dev->mdev_type.sysfs_name = VFIO_AP_MDEV_TYPE_HWVIRT; + matrix_dev->mdev_types[0] = &matrix_dev->mdev_type; ret = mdev_register_parent(&matrix_dev->parent, &matrix_dev->device, - &vfio_ap_matrix_driver); + &vfio_ap_matrix_driver, + matrix_dev->mdev_types, 1); if (ret) goto err_driver; return 0; diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 35165730f517..441dc8dda380 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -53,6 +53,8 @@ struct ap_matrix_dev { struct ap_driver *vfio_ap_drv; struct mutex guests_lock; /* serializes access to each KVM guest */ struct mdev_parent parent; + struct mdev_type mdev_type; + struct mdev_type *mdev_types[]; }; extern struct ap_matrix_dev *matrix_dev; diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index fa05ac339695..2d95a497fd3b 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -29,26 +29,6 @@ struct device *mdev_parent_dev(struct mdev_device *mdev) } EXPORT_SYMBOL(mdev_parent_dev); -/* - * Return the index in supported_type_groups that this mdev_device was created - * from. - */ -unsigned int mdev_get_type_group_id(struct mdev_device *mdev) -{ - return mdev->type->type_group_id; -} -EXPORT_SYMBOL(mdev_get_type_group_id); - -/* - * Used in mdev_type_attribute sysfs functions to return the index in the - * supported_type_groups that the sysfs is called from. - */ -unsigned int mtype_get_type_group_id(struct mdev_type *mtype) -{ - return mtype->type_group_id; -} -EXPORT_SYMBOL(mtype_get_type_group_id); - /* * Used in mdev_type_attribute sysfs functions to return the parent struct * device @@ -85,6 +65,8 @@ static int mdev_device_remove_cb(struct device *dev, void *data) * @parent: parent structure registered * @dev: device structure representing parent device. * @mdev_driver: Device driver to bind to the newly created mdev + * @types: Array of supported mdev types + * @nr_types: Number of entries in @types * * Registers the @parent stucture as a parent for mdev types and thus mdev * devices. The caller needs to hold a reference on @dev that must not be @@ -93,20 +75,19 @@ static int mdev_device_remove_cb(struct device *dev, void *data) * Returns a negative value on error, otherwise 0. */ int mdev_register_parent(struct mdev_parent *parent, struct device *dev, - struct mdev_driver *mdev_driver) + struct mdev_driver *mdev_driver, struct mdev_type **types, + unsigned int nr_types) { char *env_string = "MDEV_STATE=registered"; char *envp[] = { env_string, NULL }; int ret; - /* check for mandatory ops */ - if (!mdev_driver->supported_type_groups) - return -EINVAL; - memset(parent, 0, sizeof(*parent)); init_rwsem(&parent->unreg_sem); parent->dev = dev; parent->mdev_driver = mdev_driver; + parent->types = types; + parent->nr_types = nr_types; if (!mdev_bus_compat_class) { mdev_bus_compat_class = class_compat_register("mdev_bus"); diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 7bd4bb9850e8..1da1ecf76a0d 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -56,10 +56,9 @@ EXPORT_SYMBOL_GPL(mdev_bus_type); **/ int mdev_register_driver(struct mdev_driver *drv) { - /* initialize common driver fields */ + if (!drv->types_attrs) + return -EINVAL; drv->driver.bus = &mdev_bus_type; - - /* register with core */ return driver_register(&drv->driver); } EXPORT_SYMBOL(mdev_register_driver); diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 297f911fdc89..ba1b2dbddc0b 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -13,14 +13,6 @@ int mdev_bus_register(void); void mdev_bus_unregister(void); -struct mdev_type { - struct kobject kobj; - struct kobject *devices_kobj; - struct mdev_parent *parent; - struct list_head next; - unsigned int type_group_id; -}; - extern const struct attribute_group *mdev_device_groups[]; #define to_mdev_type_attr(_attr) \ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index b71ffc559487..38b4c2466ec4 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -82,7 +82,6 @@ static void mdev_type_release(struct kobject *kobj) pr_debug("Releasing group %s\n", kobj->name); /* Pairs with the get in add_mdev_supported_type() */ put_device(type->parent->dev); - kfree(type); } static struct kobj_type mdev_type_ktype = { @@ -90,35 +89,21 @@ static struct kobj_type mdev_type_ktype = { .release = mdev_type_release, }; -static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, - unsigned int type_group_id) +static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) { - struct mdev_type *type; - struct attribute_group *group = - parent->mdev_driver->supported_type_groups[type_group_id]; int ret; - if (!group->name) { - pr_err("%s: Type name empty!\n", __func__); - return ERR_PTR(-EINVAL); - } - - type = kzalloc(sizeof(*type), GFP_KERNEL); - if (!type) - return ERR_PTR(-ENOMEM); - type->kobj.kset = parent->mdev_types_kset; type->parent = parent; /* Pairs with the put in mdev_type_release() */ get_device(parent->dev); - type->type_group_id = type_group_id; ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, "%s-%s", dev_driver_string(parent->dev), - group->name); + type->sysfs_name); if (ret) { kobject_put(&type->kobj); - return ERR_PTR(ret); + return ret; } ret = sysfs_create_file(&type->kobj, &mdev_type_attr_create.attr); @@ -131,13 +116,10 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, goto attr_devices_failed; } - ret = sysfs_create_files(&type->kobj, - (const struct attribute **)group->attrs); - if (ret) { - ret = -ENOMEM; + ret = sysfs_create_files(&type->kobj, parent->mdev_driver->types_attrs); + if (ret) goto attrs_failed; - } - return type; + return 0; attrs_failed: kobject_put(type->devices_kobj); @@ -146,78 +128,49 @@ static struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, attr_create_failed: kobject_del(&type->kobj); kobject_put(&type->kobj); - return ERR_PTR(ret); + return ret; } -static void remove_mdev_supported_type(struct mdev_type *type) +static void mdev_type_remove(struct mdev_type *type) { - struct attribute_group *group = - type->parent->mdev_driver->supported_type_groups[type->type_group_id]; + sysfs_remove_files(&type->kobj, type->parent->mdev_driver->types_attrs); - sysfs_remove_files(&type->kobj, - (const struct attribute **)group->attrs); kobject_put(type->devices_kobj); sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); kobject_del(&type->kobj); kobject_put(&type->kobj); } -static int add_mdev_supported_type_groups(struct mdev_parent *parent) -{ - int i; - - for (i = 0; parent->mdev_driver->supported_type_groups[i]; i++) { - struct mdev_type *type; - - type = add_mdev_supported_type(parent, i); - if (IS_ERR(type)) { - struct mdev_type *ltype, *tmp; - - list_for_each_entry_safe(ltype, tmp, &parent->type_list, - next) { - list_del(<ype->next); - remove_mdev_supported_type(ltype); - } - return PTR_ERR(type); - } - list_add(&type->next, &parent->type_list); - } - return 0; -} - /* mdev sysfs functions */ void parent_remove_sysfs_files(struct mdev_parent *parent) { - struct mdev_type *type, *tmp; - - list_for_each_entry_safe(type, tmp, &parent->type_list, next) { - list_del(&type->next); - remove_mdev_supported_type(type); - } + int i; + for (i = 0; i < parent->nr_types; i++) + mdev_type_remove(parent->types[i]); kset_unregister(parent->mdev_types_kset); } int parent_create_sysfs_files(struct mdev_parent *parent) { - int ret; + int ret, i; parent->mdev_types_kset = kset_create_and_add("mdev_supported_types", NULL, &parent->dev->kobj); - if (!parent->mdev_types_kset) return -ENOMEM; - INIT_LIST_HEAD(&parent->type_list); - - ret = add_mdev_supported_type_groups(parent); - if (ret) - goto create_err; + for (i = 0; i < parent->nr_types; i++) { + ret = mdev_type_add(parent, parent->types[i]); + if (ret) + goto out_err; + } return 0; -create_err: - kset_unregister(parent->mdev_types_kset); - return ret; +out_err: + while (--i >= 0) + mdev_type_remove(parent->types[i]); + return 0; } static ssize_t remove_store(struct device *dev, struct device_attribute *attr, diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 262512c2a8ff..19bc93c10e8c 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -23,14 +23,27 @@ struct mdev_device { bool active; }; +struct mdev_type { + /* set by the driver before calling mdev_register parent: */ + const char *sysfs_name; + + /* set by the core, can be used drivers */ + struct mdev_parent *parent; + + /* internal only */ + struct kobject kobj; + struct kobject *devices_kobj; +}; + /* embedded into the struct device that the mdev devices hang off */ struct mdev_parent { struct device *dev; struct mdev_driver *mdev_driver; struct kset *mdev_types_kset; - struct list_head type_list; /* Synchronize device creation/removal with parent unregistration */ struct rw_semaphore unreg_sem; + struct mdev_type **types; + unsigned int nr_types; }; static inline struct mdev_device *to_mdev_device(struct device *dev) @@ -38,8 +51,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -unsigned int mdev_get_type_group_id(struct mdev_device *mdev); -unsigned int mtype_get_type_group_id(struct mdev_type *mtype); struct device *mtype_get_parent_dev(struct mdev_type *mtype); /* interface for exporting mdev supported type attributes */ @@ -66,22 +77,21 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ * struct mdev_driver - Mediated device driver * @probe: called when new device created * @remove: called when device removed - * @supported_type_groups: Attributes to define supported types. It is mandatory - * to provide supported types. + * @types_attrs: attributes to the type kobjects. * @driver: device driver structure - * **/ struct mdev_driver { int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); - struct attribute_group **supported_type_groups; + const struct attribute * const *types_attrs; struct device_driver driver; }; extern struct bus_type mdev_bus_type; int mdev_register_parent(struct mdev_parent *parent, struct device *dev, - struct mdev_driver *mdev_driver); + struct mdev_driver *mdev_driver, struct mdev_type **types, + unsigned int nr_types); void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 2c4791abbc3d..4d0839cb5194 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -99,23 +99,27 @@ MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices"); #define MBOCHS_TYPE_2 "medium" #define MBOCHS_TYPE_3 "large" -static const struct mbochs_type { +static struct mbochs_type { + struct mdev_type type; const char *name; u32 mbytes; u32 max_x; u32 max_y; } mbochs_types[] = { { + .type.sysfs_name = MBOCHS_TYPE_1, .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, .mbytes = 4, .max_x = 800, .max_y = 600, }, { + .type.sysfs_name = MBOCHS_TYPE_2, .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, .mbytes = 16, .max_x = 1920, .max_y = 1440, }, { + .type.sysfs_name = MBOCHS_TYPE_3, .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, .mbytes = 64, .max_x = 0, @@ -123,6 +127,11 @@ static const struct mbochs_type { }, }; +static struct mdev_type *mbochs_mdev_types[] = { + &mbochs_types[0].type, + &mbochs_types[1].type, + &mbochs_types[2].type, +}; static dev_t mbochs_devt; static struct class *mbochs_class; @@ -510,8 +519,8 @@ static int mbochs_init_dev(struct vfio_device *vdev) struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); struct mdev_device *mdev = to_mdev_device(vdev->dev); - const struct mbochs_type *type = - &mbochs_types[mdev_get_type_group_id(mdev)]; + struct mbochs_type *type = + container_of(mdev->type, struct mbochs_type, type); int avail_mbytes = atomic_read(&mbochs_avail_mbytes); int ret = -ENOMEM; @@ -1345,8 +1354,8 @@ static const struct attribute_group *mdev_dev_groups[] = { static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mbochs_type *type = - &mbochs_types[mtype_get_type_group_id(mtype)]; + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); return sprintf(buf, "%s\n", type->name); } @@ -1355,8 +1364,8 @@ static MDEV_TYPE_ATTR_RO(name); static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mbochs_type *type = - &mbochs_types[mtype_get_type_group_id(mtype)]; + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); return sprintf(buf, "virtual display, %d MB video memory\n", type ? type->mbytes : 0); @@ -1367,8 +1376,8 @@ static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mbochs_type *type = - &mbochs_types[mtype_get_type_group_id(mtype)]; + struct mbochs_type *type = + container_of(mtype, struct mbochs_type, type); int count = atomic_read(&mbochs_avail_mbytes) / type->mbytes; return sprintf(buf, "%d\n", count); @@ -1382,7 +1391,7 @@ static ssize_t device_api_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(device_api); -static struct attribute *mdev_types_attrs[] = { +static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, &mdev_type_attr_device_api.attr, @@ -1390,28 +1399,6 @@ static struct attribute *mdev_types_attrs[] = { NULL, }; -static struct attribute_group mdev_type_group1 = { - .name = MBOCHS_TYPE_1, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group2 = { - .name = MBOCHS_TYPE_2, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group3 = { - .name = MBOCHS_TYPE_3, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group *mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - &mdev_type_group3, - NULL, -}; - static const struct vfio_device_ops mbochs_dev_ops = { .close_device = mbochs_close_device, .init = mbochs_init_dev, @@ -1431,7 +1418,7 @@ static struct mdev_driver mbochs_driver = { }, .probe = mbochs_probe, .remove = mbochs_remove, - .supported_type_groups = mdev_type_groups, + .types_attrs = mdev_types_attrs, }; static const struct file_operations vd_fops = { @@ -1476,7 +1463,9 @@ static int __init mbochs_dev_init(void) if (ret) goto err_class; - ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver); + ret = mdev_register_parent(&mbochs_parent, &mbochs_dev, &mbochs_driver, + mbochs_mdev_types, + ARRAY_SIZE(mbochs_mdev_types)); if (ret) goto err_device; diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 01f345430b97..4a341f4849e7 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -51,7 +51,8 @@ MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); #define MDPY_TYPE_2 "xga" #define MDPY_TYPE_3 "hd" -static const struct mdpy_type { +static struct mdpy_type { + struct mdev_type type; const char *name; u32 format; u32 bytepp; @@ -59,18 +60,21 @@ static const struct mdpy_type { u32 height; } mdpy_types[] = { { + .type.sysfs_name = MDPY_TYPE_1, .name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 640, .height = 480, }, { + .type.sysfs_name = MDPY_TYPE_2, .name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 1024, .height = 768, }, { + .type.sysfs_name = MDPY_TYPE_3, .name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, @@ -79,6 +83,12 @@ static const struct mdpy_type { }, }; +static struct mdev_type *mdpy_mdev_types[] = { + &mdpy_types[0].type, + &mdpy_types[1].type, + &mdpy_types[2].type, +}; + static dev_t mdpy_devt; static struct class *mdpy_class; static struct cdev mdpy_cdev; @@ -222,7 +232,7 @@ static int mdpy_init_dev(struct vfio_device *vdev) container_of(vdev, struct mdev_state, vdev); struct mdev_device *mdev = to_mdev_device(vdev->dev); const struct mdpy_type *type = - &mdpy_types[mdev_get_type_group_id(mdev)]; + container_of(mdev->type, struct mdpy_type, type); u32 fbsize; int ret = -ENOMEM; @@ -655,8 +665,7 @@ static const struct attribute_group *mdev_dev_groups[] = { static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mdpy_type *type = - &mdpy_types[mtype_get_type_group_id(mtype)]; + struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); return sprintf(buf, "%s\n", type->name); } @@ -665,8 +674,7 @@ static MDEV_TYPE_ATTR_RO(name); static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - const struct mdpy_type *type = - &mdpy_types[mtype_get_type_group_id(mtype)]; + struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); return sprintf(buf, "virtual display, %dx%d framebuffer\n", type->width, type->height); @@ -688,7 +696,7 @@ static ssize_t device_api_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(device_api); -static struct attribute *mdev_types_attrs[] = { +static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, &mdev_type_attr_device_api.attr, @@ -696,28 +704,6 @@ static struct attribute *mdev_types_attrs[] = { NULL, }; -static struct attribute_group mdev_type_group1 = { - .name = MDPY_TYPE_1, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group2 = { - .name = MDPY_TYPE_2, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group3 = { - .name = MDPY_TYPE_3, - .attrs = mdev_types_attrs, -}; - -static struct attribute_group *mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - &mdev_type_group3, - NULL, -}; - static const struct vfio_device_ops mdpy_dev_ops = { .init = mdpy_init_dev, .release = mdpy_release_dev, @@ -736,7 +722,7 @@ static struct mdev_driver mdpy_driver = { }, .probe = mdpy_probe, .remove = mdpy_remove, - .supported_type_groups = mdev_type_groups, + .types_attrs = mdev_types_attrs, }; static const struct file_operations vd_fops = { @@ -779,7 +765,9 @@ static int __init mdpy_dev_init(void) if (ret) goto err_class; - ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver); + ret = mdev_register_parent(&mdpy_parent, &mdpy_dev, &mdpy_driver, + mdpy_mdev_types, + ARRAY_SIZE(mdpy_mdev_types)); if (ret) goto err_device; diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index e80baac51381..814a7f98738a 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -143,6 +143,20 @@ struct mdev_state { int nr_ports; }; +static struct mtty_type { + struct mdev_type type; + int nr_ports; + const char *name; +} mtty_types[2] = { + { .nr_ports = 1, .type.sysfs_name = "1", .name = "Single port serial" }, + { .nr_ports = 2, .type.sysfs_name = "2", .name = "Dual port serial" }, +}; + +static struct mdev_type *mtty_mdev_types[] = { + &mtty_types[0].type, + &mtty_types[1].type, +}; + static atomic_t mdev_avail_ports = ATOMIC_INIT(MAX_MTTYS); static const struct file_operations vd_fops = { @@ -707,17 +721,19 @@ static int mtty_init_dev(struct vfio_device *vdev) struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); struct mdev_device *mdev = to_mdev_device(vdev->dev); - int nr_ports = mdev_get_type_group_id(mdev) + 1; + struct mtty_type *type = + container_of(mdev->type, struct mtty_type, type); int avail_ports = atomic_read(&mdev_avail_ports); int ret; do { - if (avail_ports < nr_ports) + if (avail_ports < type->nr_ports) return -ENOSPC; } while (!atomic_try_cmpxchg(&mdev_avail_ports, - &avail_ports, avail_ports - nr_ports)); + &avail_ports, + avail_ports - type->nr_ports)); - mdev_state->nr_ports = nr_ports; + mdev_state->nr_ports = type->nr_ports; mdev_state->irq_index = -1; mdev_state->s[0].max_fifo_size = MAX_FIFO_SIZE; mdev_state->s[1].max_fifo_size = MAX_FIFO_SIZE; @@ -735,7 +751,7 @@ static int mtty_init_dev(struct vfio_device *vdev) return 0; err_nr_ports: - atomic_add(nr_ports, &mdev_avail_ports); + atomic_add(type->nr_ports, &mdev_avail_ports); return ret; } @@ -1242,11 +1258,9 @@ static const struct attribute_group *mdev_dev_groups[] = { static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - static const char *name_str[2] = { "Single port serial", - "Dual port serial" }; + struct mtty_type *type = container_of(mtype, struct mtty_type, type); - return sysfs_emit(buf, "%s\n", - name_str[mtype_get_type_group_id(mtype)]); + return sysfs_emit(buf, "%s\n", type->name); } static MDEV_TYPE_ATTR_RO(name); @@ -1255,9 +1269,10 @@ static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - unsigned int ports = mtype_get_type_group_id(mtype) + 1; + struct mtty_type *type = container_of(mtype, struct mtty_type, type); - return sprintf(buf, "%d\n", atomic_read(&mdev_avail_ports) / ports); + return sprintf(buf, "%d\n", atomic_read(&mdev_avail_ports) / + type->nr_ports); } static MDEV_TYPE_ATTR_RO(available_instances); @@ -1270,29 +1285,13 @@ static ssize_t device_api_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(device_api); -static struct attribute *mdev_types_attrs[] = { +static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; -static struct attribute_group mdev_type_group1 = { - .name = "1", - .attrs = mdev_types_attrs, -}; - -static struct attribute_group mdev_type_group2 = { - .name = "2", - .attrs = mdev_types_attrs, -}; - -static struct attribute_group *mdev_type_groups[] = { - &mdev_type_group1, - &mdev_type_group2, - NULL, -}; - static const struct vfio_device_ops mtty_dev_ops = { .name = "vfio-mtty", .init = mtty_init_dev, @@ -1311,7 +1310,7 @@ static struct mdev_driver mtty_driver = { }, .probe = mtty_probe, .remove = mtty_remove, - .supported_type_groups = mdev_type_groups, + .types_attrs = mdev_types_attrs, }; static void mtty_device_release(struct device *dev) @@ -1363,7 +1362,8 @@ static int __init mtty_dev_init(void) goto err_class; ret = mdev_register_parent(&mtty_dev.parent, &mtty_dev.dev, - &mtty_driver); + &mtty_driver, mtty_mdev_types, + ARRAY_SIZE(mtty_mdev_types)); if (ret) goto err_device; return 0; From cbf3bb28aaeaee425ca7b9c537a3efff1f8c98ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:44 +0200 Subject: [PATCH 63/77] vfio/mdev: remove mdev_from_dev Just open code it in the only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-7-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/mdev/mdev_core.c | 6 ++---- include/linux/mdev.h | 4 ---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 2d95a497fd3b..bde7ce620dae 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -53,10 +53,8 @@ static void mdev_device_remove_common(struct mdev_device *mdev) static int mdev_device_remove_cb(struct device *dev, void *data) { - struct mdev_device *mdev = mdev_from_dev(dev); - - if (mdev) - mdev_device_remove_common(mdev); + if (dev->bus == &mdev_bus_type) + mdev_device_remove_common(to_mdev_device(dev)); return 0; } diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 19bc93c10e8c..4f558de52fd9 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -102,9 +102,5 @@ static inline struct device *mdev_dev(struct mdev_device *mdev) { return &mdev->dev; } -static inline struct mdev_device *mdev_from_dev(struct device *dev) -{ - return dev->bus == &mdev_bus_type ? to_mdev_device(dev) : NULL; -} #endif /* MDEV_H */ From 2815fe149ffa8e1a022b2830ab62999135c00a4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:45 +0200 Subject: [PATCH 64/77] vfio/mdev: unexport mdev_bus_type mdev_bus_type is only used in mdev.ko now, so unexport it. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-8-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/mdev/mdev_driver.c | 1 - drivers/vfio/mdev/mdev_private.h | 1 + include/linux/mdev.h | 2 -- 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 1da1ecf76a0d..5b3c94f4fb13 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -46,7 +46,6 @@ struct bus_type mdev_bus_type = { .remove = mdev_remove, .match = mdev_match, }; -EXPORT_SYMBOL_GPL(mdev_bus_type); /** * mdev_register_driver - register a new MDEV driver diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index ba1b2dbddc0b..af457b27f607 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -13,6 +13,7 @@ int mdev_bus_register(void); void mdev_bus_unregister(void); +extern struct bus_type mdev_bus_type; extern const struct attribute_group *mdev_device_groups[]; #define to_mdev_type_attr(_attr) \ diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 4f558de52fd9..6c179d2b8927 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -87,8 +87,6 @@ struct mdev_driver { struct device_driver driver; }; -extern struct bus_type mdev_bus_type; - int mdev_register_parent(struct mdev_parent *parent, struct device *dev, struct mdev_driver *mdev_driver, struct mdev_type **types, unsigned int nr_types); From 062e720cd209d8091c4f3d118d93973f02209aca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:46 +0200 Subject: [PATCH 65/77] vfio/mdev: remove mdev_parent_dev Just open code the dereferences in the only user. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-9-hch@lst.de Signed-off-by: Alex Williamson --- Documentation/driver-api/vfio-mediated-device.rst | 3 --- drivers/gpu/drm/i915/gvt/kvmgt.c | 2 +- drivers/vfio/mdev/mdev_core.c | 6 ------ include/linux/mdev.h | 1 - 4 files changed, 1 insertion(+), 11 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index ff7342d2e332..7b660f3fa2c9 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -200,9 +200,6 @@ Directories and files under the sysfs for Each Physical Device sprintf(buf, "%s-%s", dev_driver_string(parent->dev), group->name); - (or using mdev_parent_dev(mdev) to arrive at the parent device outside - of the core mdev code) - * device_api This attribute should show which device API is being created, for example, diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 12b0b3306168..2265dd867956 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1488,7 +1488,7 @@ static int intel_vgpu_init_dev(struct vfio_device *vfio_dev) struct intel_vgpu_type *type = container_of(mdev->type, struct intel_vgpu_type, type); - vgpu->gvt = kdev_to_i915(mdev_parent_dev(mdev))->gvt; + vgpu->gvt = kdev_to_i915(mdev->type->parent->dev)->gvt; return intel_gvt_create_vgpu(vgpu, type->conf); } diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index bde7ce620dae..75628759a3bf 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -23,12 +23,6 @@ static struct class_compat *mdev_bus_compat_class; static LIST_HEAD(mdev_list); static DEFINE_MUTEX(mdev_list_lock); -struct device *mdev_parent_dev(struct mdev_device *mdev) -{ - return mdev->type->parent->dev; -} -EXPORT_SYMBOL(mdev_parent_dev); - /* * Used in mdev_type_attribute sysfs functions to return the parent struct * device diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 6c179d2b8927..bbedffcb38d4 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -95,7 +95,6 @@ void mdev_unregister_parent(struct mdev_parent *parent); int mdev_register_driver(struct mdev_driver *drv); void mdev_unregister_driver(struct mdev_driver *drv); -struct device *mdev_parent_dev(struct mdev_device *mdev); static inline struct device *mdev_dev(struct mdev_device *mdev) { return &mdev->dev; From c7c1f38f6cba7e3249866c06639ea62755f0a24e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:47 +0200 Subject: [PATCH 66/77] vfio/mdev: remove mtype_get_parent_dev Just open code the dereferences in the only user. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Jason J. Herne Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-10-hch@lst.de Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_ops.c | 3 +-- drivers/vfio/mdev/mdev_core.c | 10 ---------- include/linux/mdev.h | 2 -- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index c37e712a4b06..3db6251b3114 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -62,8 +62,7 @@ static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - struct vfio_ccw_private *private = - dev_get_drvdata(mtype_get_parent_dev(mtype)); + struct vfio_ccw_private *private = dev_get_drvdata(mtype->parent->dev); return sprintf(buf, "%d\n", atomic_read(&private->avail)); } diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 75628759a3bf..93f8caf2e5f7 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -23,16 +23,6 @@ static struct class_compat *mdev_bus_compat_class; static LIST_HEAD(mdev_list); static DEFINE_MUTEX(mdev_list_lock); -/* - * Used in mdev_type_attribute sysfs functions to return the parent struct - * device - */ -struct device *mtype_get_parent_dev(struct mdev_type *mtype) -{ - return mtype->parent->dev; -} -EXPORT_SYMBOL(mtype_get_parent_dev); - /* Caller must hold parent unreg_sem read or write lock */ static void mdev_device_remove_common(struct mdev_device *mdev) { diff --git a/include/linux/mdev.h b/include/linux/mdev.h index bbedffcb38d4..e445f809ceca 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -51,8 +51,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -struct device *mtype_get_parent_dev(struct mdev_type *mtype); - /* interface for exporting mdev supported type attributes */ struct mdev_type_attribute { struct attribute attr; From 290aac5df88a83e264b3a73ec146e5e5b3c45793 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 23 Sep 2022 11:26:48 +0200 Subject: [PATCH 67/77] vfio/mdev: consolidate all the device_api sysfs into the core code Every driver just emits a static string, simply feed it through the ops and provide a standard sysfs show function. Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-11-hch@lst.de Signed-off-by: Alex Williamson --- .../driver-api/vfio-mediated-device.rst | 2 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 9 +---- drivers/s390/cio/vfio_ccw_ops.c | 9 +---- drivers/s390/crypto/vfio_ap_ops.c | 10 +----- drivers/vfio/mdev/mdev_driver.c | 4 ++- drivers/vfio/mdev/mdev_sysfs.c | 35 +++++++++++++------ include/linux/mdev.h | 7 ++-- samples/vfio-mdev/mbochs.c | 9 +---- samples/vfio-mdev/mdpy.c | 9 +---- samples/vfio-mdev/mtty.c | 10 +----- 10 files changed, 37 insertions(+), 67 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index 7b660f3fa2c9..b0c29e37f61b 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -202,7 +202,7 @@ Directories and files under the sysfs for Each Physical Device * device_api - This attribute should show which device API is being created, for example, + This attribute shows which device API is being created, for example, "vfio-pci" for a PCI device. * available_instances diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 2265dd867956..0f70886a63e9 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -123,12 +123,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, return sprintf(buf, "%u\n", type->avail_instance); } -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} - static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { @@ -151,13 +145,11 @@ static ssize_t name_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); -static MDEV_TYPE_ATTR_RO(device_api); static MDEV_TYPE_ATTR_RO(description); static MDEV_TYPE_ATTR_RO(name); static const struct attribute *gvt_type_attrs[] = { &mdev_type_attr_available_instances.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_description.attr, &mdev_type_attr_name.attr, NULL, @@ -1550,6 +1542,7 @@ static void intel_vgpu_remove(struct mdev_device *mdev) } static struct mdev_driver intel_vgpu_mdev_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { .name = "intel_vgpu_mdev", .owner = THIS_MODULE, diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 3db6251b3114..4c7b18151922 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -51,13 +51,6 @@ static ssize_t name_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(name); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_CCW_STRING); -} -static MDEV_TYPE_ATTR_RO(device_api); - static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) @@ -70,7 +63,6 @@ static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -628,6 +620,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { }; struct mdev_driver vfio_ccw_mdev_driver = { + .device_api = VFIO_DEVICE_API_CCW_STRING, .driver = { .name = "vfio_ccw_mdev", .owner = THIS_MODULE, diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 24d131c502ca..d440acfbb261 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -808,17 +808,8 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_AP_STRING); -} - -static MDEV_TYPE_ATTR_RO(device_api); - static const struct attribute *vfio_ap_mdev_type_attrs[] = { &mdev_type_attr_name.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -1799,6 +1790,7 @@ static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { }; static struct mdev_driver vfio_ap_matrix_driver = { + .device_api = VFIO_DEVICE_API_AP_STRING, .driver = { .name = "vfio_ap_mdev", .owner = THIS_MODULE, diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 5b3c94f4fb13..60e8b9f6474e 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -55,8 +55,10 @@ struct bus_type mdev_bus_type = { **/ int mdev_register_driver(struct mdev_driver *drv) { - if (!drv->types_attrs) + if (!drv->types_attrs || !drv->device_api) return -EINVAL; + + /* initialize common driver fields */ drv->driver.bus = &mdev_bus_type; return driver_register(&drv->driver); } diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 38b4c2466ec4..60fc52ff9244 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -72,9 +72,30 @@ static ssize_t create_store(struct mdev_type *mtype, return count; } - static MDEV_TYPE_ATTR_WO(create); +static ssize_t device_api_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", mtype->parent->mdev_driver->device_api); +} +static MDEV_TYPE_ATTR_RO(device_api); + +static struct attribute *mdev_types_core_attrs[] = { + &mdev_type_attr_create.attr, + &mdev_type_attr_device_api.attr, + NULL, +}; + +static struct attribute_group mdev_type_core_group = { + .attrs = mdev_types_core_attrs, +}; + +static const struct attribute_group *mdev_type_groups[] = { + &mdev_type_core_group, + NULL, +}; + static void mdev_type_release(struct kobject *kobj) { struct mdev_type *type = to_mdev_type(kobj); @@ -85,8 +106,9 @@ static void mdev_type_release(struct kobject *kobj) } static struct kobj_type mdev_type_ktype = { - .sysfs_ops = &mdev_type_sysfs_ops, - .release = mdev_type_release, + .sysfs_ops = &mdev_type_sysfs_ops, + .release = mdev_type_release, + .default_groups = mdev_type_groups, }; static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) @@ -106,10 +128,6 @@ static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) return ret; } - ret = sysfs_create_file(&type->kobj, &mdev_type_attr_create.attr); - if (ret) - goto attr_create_failed; - type->devices_kobj = kobject_create_and_add("devices", &type->kobj); if (!type->devices_kobj) { ret = -ENOMEM; @@ -124,8 +142,6 @@ static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) attrs_failed: kobject_put(type->devices_kobj); attr_devices_failed: - sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); -attr_create_failed: kobject_del(&type->kobj); kobject_put(&type->kobj); return ret; @@ -136,7 +152,6 @@ static void mdev_type_remove(struct mdev_type *type) sysfs_remove_files(&type->kobj, type->parent->mdev_driver->types_attrs); kobject_put(type->devices_kobj); - sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); kobject_del(&type->kobj); kobject_put(&type->kobj); } diff --git a/include/linux/mdev.h b/include/linux/mdev.h index e445f809ceca..af1ff0165b8d 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -61,11 +61,6 @@ struct mdev_type_attribute { size_t count); }; -#define MDEV_TYPE_ATTR(_name, _mode, _show, _store) \ -struct mdev_type_attribute mdev_type_attr_##_name = \ - __ATTR(_name, _mode, _show, _store) -#define MDEV_TYPE_ATTR_RW(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RW(_name) #define MDEV_TYPE_ATTR_RO(_name) \ struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) #define MDEV_TYPE_ATTR_WO(_name) \ @@ -73,12 +68,14 @@ struct mdev_type_attribute mdev_type_attr_##_name = \ /** * struct mdev_driver - Mediated device driver + * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed * @types_attrs: attributes to the type kobjects. * @driver: device driver structure **/ struct mdev_driver { + const char *device_api; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); const struct attribute * const *types_attrs; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 4d0839cb5194..a2fc13fade75 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1384,17 +1384,9 @@ static ssize_t available_instances_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} -static MDEV_TYPE_ATTR_RO(device_api); - static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -1410,6 +1402,7 @@ static const struct vfio_device_ops mbochs_dev_ops = { }; static struct mdev_driver mbochs_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { .name = "mbochs", .owner = THIS_MODULE, diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 4a341f4849e7..f9069ed2750f 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -689,17 +689,9 @@ static ssize_t available_instances_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} -static MDEV_TYPE_ATTR_RO(device_api); - static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -714,6 +706,7 @@ static const struct vfio_device_ops mdpy_dev_ops = { }; static struct mdev_driver mdpy_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { .name = "mdpy", .owner = THIS_MODULE, diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 814a7f98738a..064e71b28dd1 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1277,17 +1277,8 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); -static ssize_t device_api_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_DEVICE_API_PCI_STRING); -} - -static MDEV_TYPE_ATTR_RO(device_api); - static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_name.attr, - &mdev_type_attr_device_api.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -1302,6 +1293,7 @@ static const struct vfio_device_ops mtty_dev_ops = { }; static struct mdev_driver mtty_driver = { + .device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { .name = "mtty", .owner = THIS_MODULE, From 0bc79069ccbdbe26492493dd0c4e38b7cadf8ad5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:49 +0200 Subject: [PATCH 68/77] vfio/mdev: consolidate all the name sysfs into the core code Every driver just emits a static string, simply add a field to the mdev_type for the driver to fill out or fall back to the sysfs name and provide a standard sysfs show function. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-12-hch@lst.de Signed-off-by: Alex Williamson --- .../driver-api/vfio-mediated-device.rst | 2 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 8 ------- drivers/s390/cio/vfio_ccw_drv.c | 1 + drivers/s390/cio/vfio_ccw_ops.c | 8 ------- drivers/s390/crypto/vfio_ap_ops.c | 10 +-------- drivers/vfio/mdev/mdev_sysfs.c | 10 +++++++++ include/linux/mdev.h | 1 + samples/vfio-mdev/mbochs.c | 20 ++++-------------- samples/vfio-mdev/mdpy.c | 21 +++++-------------- samples/vfio-mdev/mtty.c | 18 ++++------------ 10 files changed, 27 insertions(+), 72 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index b0c29e37f61b..dcd1231a6fa8 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -217,7 +217,7 @@ Directories and files under the sysfs for Each Physical Device * name - This attribute should show human readable name. This is optional attribute. + This attribute shows a human readable name. * description diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 0f70886a63e9..93a52ae26f68 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -138,20 +138,12 @@ static ssize_t description_show(struct mdev_type *mtype, type->conf->weight); } -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", mtype->sysfs_name); -} - static MDEV_TYPE_ATTR_RO(available_instances); static MDEV_TYPE_ATTR_RO(description); -static MDEV_TYPE_ATTR_RO(name); static const struct attribute *gvt_type_attrs[] = { &mdev_type_attr_available_instances.attr, &mdev_type_attr_description.attr, - &mdev_type_attr_name.attr, NULL, }; diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 25a5de08b390..e5f21c725326 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -221,6 +221,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) dev_set_drvdata(&sch->dev, private); private->mdev_type.sysfs_name = "io"; + private->mdev_type.pretty_name = "I/O subchannel (Non-QDIO)"; private->mdev_types[0] = &private->mdev_type; ret = mdev_register_parent(&private->parent, &sch->dev, &vfio_ccw_mdev_driver, diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 4c7b18151922..394aab60dbd0 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -44,13 +44,6 @@ static void vfio_ccw_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length) vfio_ccw_mdev_reset(private); } -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "I/O subchannel (Non-QDIO)\n"); -} -static MDEV_TYPE_ATTR_RO(name); - static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) @@ -62,7 +55,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_available_instances.attr, NULL, }; diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index d440acfbb261..5d8dd7e837f3 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -790,14 +790,6 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) vfio_put_device(&matrix_mdev->vdev); } -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - return sprintf(buf, "%s\n", VFIO_AP_MDEV_NAME_HWVIRT); -} - -static MDEV_TYPE_ATTR_RO(name); - static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) @@ -809,7 +801,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *vfio_ap_mdev_type_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_available_instances.attr, NULL, }; @@ -1813,6 +1804,7 @@ int vfio_ap_mdev_register(void) return ret; matrix_dev->mdev_type.sysfs_name = VFIO_AP_MDEV_TYPE_HWVIRT; + matrix_dev->mdev_type.pretty_name = VFIO_AP_MDEV_NAME_HWVIRT; matrix_dev->mdev_types[0] = &matrix_dev->mdev_type; ret = mdev_register_parent(&matrix_dev->parent, &matrix_dev->device, &vfio_ap_matrix_driver, diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 60fc52ff9244..34583e6a97f2 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -81,9 +81,19 @@ static ssize_t device_api_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(device_api); +static ssize_t name_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", + mtype->pretty_name ? mtype->pretty_name : mtype->sysfs_name); +} + +static MDEV_TYPE_ATTR_RO(name); + static struct attribute *mdev_types_core_attrs[] = { &mdev_type_attr_create.attr, &mdev_type_attr_device_api.attr, + &mdev_type_attr_name.attr, NULL, }; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index af1ff0165b8d..4bb8a58b577b 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -26,6 +26,7 @@ struct mdev_device { struct mdev_type { /* set by the driver before calling mdev_register parent: */ const char *sysfs_name; + const char *pretty_name; /* set by the core, can be used drivers */ struct mdev_parent *parent; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index a2fc13fade75..0b7585f16d8a 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -101,26 +101,25 @@ MODULE_PARM_DESC(mem, "megabytes available to " MBOCHS_NAME " devices"); static struct mbochs_type { struct mdev_type type; - const char *name; u32 mbytes; u32 max_x; u32 max_y; } mbochs_types[] = { { .type.sysfs_name = MBOCHS_TYPE_1, - .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_1, .mbytes = 4, .max_x = 800, .max_y = 600, }, { .type.sysfs_name = MBOCHS_TYPE_2, - .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_2, .mbytes = 16, .max_x = 1920, .max_y = 1440, }, { .type.sysfs_name = MBOCHS_TYPE_3, - .name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, + .type.pretty_name = MBOCHS_CLASS_NAME "-" MBOCHS_TYPE_3, .mbytes = 64, .max_x = 0, .max_y = 0, @@ -556,7 +555,7 @@ static int mbochs_init_dev(struct vfio_device *vdev) mbochs_reset(mdev_state); dev_info(vdev->dev, "%s: %s, %d MB, %ld pages\n", __func__, - type->name, type->mbytes, mdev_state->pagecount); + type->type.pretty_name, type->mbytes, mdev_state->pagecount); return 0; err_vconfig: @@ -1351,16 +1350,6 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - struct mbochs_type *type = - container_of(mtype, struct mbochs_type, type); - - return sprintf(buf, "%s\n", type->name); -} -static MDEV_TYPE_ATTR_RO(name); - static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { @@ -1385,7 +1374,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, &mdev_type_attr_available_instances.attr, NULL, diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index f9069ed2750f..90c6fed200b1 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -53,7 +53,6 @@ MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); static struct mdpy_type { struct mdev_type type; - const char *name; u32 format; u32 bytepp; u32 width; @@ -61,21 +60,21 @@ static struct mdpy_type { } mdpy_types[] = { { .type.sysfs_name = MDPY_TYPE_1, - .name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_1, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 640, .height = 480, }, { .type.sysfs_name = MDPY_TYPE_2, - .name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_2, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 1024, .height = 768, }, { .type.sysfs_name = MDPY_TYPE_3, - .name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, + .type.pretty_name = MDPY_CLASS_NAME "-" MDPY_TYPE_3, .format = DRM_FORMAT_XRGB8888, .bytepp = 4, .width = 1920, @@ -256,8 +255,8 @@ static int mdpy_init_dev(struct vfio_device *vdev) mdpy_create_config_space(mdev_state); mdpy_reset(mdev_state); - dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->name, type->width, - type->height); + dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->type.pretty_name, + type->width, type->height); mdpy_count++; return 0; @@ -662,15 +661,6 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); - - return sprintf(buf, "%s\n", type->name); -} -static MDEV_TYPE_ATTR_RO(name); - static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { @@ -690,7 +680,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_description.attr, &mdev_type_attr_available_instances.attr, NULL, diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 064e71b28dd1..eab1b4442a96 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -146,10 +146,11 @@ struct mdev_state { static struct mtty_type { struct mdev_type type; int nr_ports; - const char *name; } mtty_types[2] = { - { .nr_ports = 1, .type.sysfs_name = "1", .name = "Single port serial" }, - { .nr_ports = 2, .type.sysfs_name = "2", .name = "Dual port serial" }, + { .nr_ports = 1, .type.sysfs_name = "1", + .type.pretty_name = "Single port serial" }, + { .nr_ports = 2, .type.sysfs_name = "2", + .type.pretty_name = "Dual port serial" }, }; static struct mdev_type *mtty_mdev_types[] = { @@ -1255,16 +1256,6 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t name_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) -{ - struct mtty_type *type = container_of(mtype, struct mtty_type, type); - - return sysfs_emit(buf, "%s\n", type->name); -} - -static MDEV_TYPE_ATTR_RO(name); - static ssize_t available_instances_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) @@ -1278,7 +1269,6 @@ static ssize_t available_instances_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_name.attr, &mdev_type_attr_available_instances.attr, NULL, }; From f2fbc72e6da4f8e01fe5fe3d6871a791e76271c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:50 +0200 Subject: [PATCH 69/77] vfio/mdev: consolidate all the available_instance sysfs into the core code Every driver just print a number, simply add a method to the mdev_driver to return it and provide a standard sysfs show function. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-13-hch@lst.de Signed-off-by: Alex Williamson --- .../driver-api/vfio-mediated-device.rst | 3 +- drivers/gpu/drm/i915/gvt/gvt.h | 1 - drivers/gpu/drm/i915/gvt/kvmgt.c | 34 +++++++++------ drivers/gpu/drm/i915/gvt/vgpu.c | 41 ++----------------- drivers/s390/cio/vfio_ccw_ops.c | 14 ++----- drivers/s390/crypto/vfio_ap_ops.c | 16 ++------ drivers/vfio/mdev/mdev_sysfs.c | 11 +++++ include/linux/mdev.h | 2 + samples/vfio-mdev/mbochs.c | 10 ++--- samples/vfio-mdev/mdpy.c | 9 ++-- samples/vfio-mdev/mtty.c | 16 ++------ 11 files changed, 55 insertions(+), 102 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index dcd1231a6fa8..558bd7ebced8 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -103,6 +103,7 @@ structure to represent a mediated device's driver:: struct mdev_driver { int (*probe) (struct mdev_device *dev); void (*remove) (struct mdev_device *dev); + unsigned int (*get_available)(struct mdev_type *mtype); const struct attribute * const *types_attrs; struct device_driver driver; }; @@ -207,7 +208,7 @@ Directories and files under the sysfs for Each Physical Device * available_instances - This attribute should show the number of devices of type that can be + This attribute shows the number of devices of type that can be created. * [device] diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h index db182066d56c..dbf8d7470b2c 100644 --- a/drivers/gpu/drm/i915/gvt/gvt.h +++ b/drivers/gpu/drm/i915/gvt/gvt.h @@ -314,7 +314,6 @@ struct intel_vgpu_type { struct mdev_type type; char name[16]; const struct intel_vgpu_config *conf; - unsigned int avail_instance; }; struct intel_gvt { diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 93a52ae26f68..45051aedb319 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -113,16 +113,6 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node); -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) -{ - struct intel_vgpu_type *type = - container_of(mtype, struct intel_vgpu_type, type); - - return sprintf(buf, "%u\n", type->avail_instance); -} - static ssize_t description_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { @@ -138,11 +128,9 @@ static ssize_t description_show(struct mdev_type *mtype, type->conf->weight); } -static MDEV_TYPE_ATTR_RO(available_instances); static MDEV_TYPE_ATTR_RO(description); static const struct attribute *gvt_type_attrs[] = { - &mdev_type_attr_available_instances.attr, &mdev_type_attr_description.attr, NULL, }; @@ -1533,6 +1521,27 @@ static void intel_vgpu_remove(struct mdev_device *mdev) vfio_put_device(&vgpu->vfio_device); } +static unsigned int intel_vgpu_get_available(struct mdev_type *mtype) +{ + struct intel_vgpu_type *type = + container_of(mtype, struct intel_vgpu_type, type); + struct intel_gvt *gvt = kdev_to_i915(mtype->parent->dev)->gvt; + unsigned int low_gm_avail, high_gm_avail, fence_avail; + + mutex_lock(&gvt->lock); + low_gm_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE - + gvt->gm.vgpu_allocated_low_gm_size; + high_gm_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE - + gvt->gm.vgpu_allocated_high_gm_size; + fence_avail = gvt_fence_sz(gvt) - HOST_FENCE - + gvt->fence.vgpu_allocated_fence_num; + mutex_unlock(&gvt->lock); + + return min3(low_gm_avail / type->conf->low_mm, + high_gm_avail / type->conf->high_mm, + fence_avail / type->conf->fence); +} + static struct mdev_driver intel_vgpu_mdev_driver = { .device_api = VFIO_DEVICE_API_PCI_STRING, .driver = { @@ -1542,6 +1551,7 @@ static struct mdev_driver intel_vgpu_mdev_driver = { }, .probe = intel_vgpu_probe, .remove = intel_vgpu_remove, + .get_available = intel_vgpu_get_available, .types_attrs = gvt_type_attrs, }; diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index 92aaa77fecee..56c71474008a 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -129,11 +129,11 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt) sprintf(gvt->types[i].name, "GVTg_V%u_%s", GRAPHICS_VER(gvt->gt->i915) == 8 ? 4 : 5, conf->name); gvt->types[i].conf = conf; - gvt->types[i].avail_instance = min(low_avail / conf->low_mm, - high_avail / conf->high_mm); gvt_dbg_core("type[%d]: %s avail %u low %u high %u fence %u weight %u res %s\n", - i, gvt->types[i].name, gvt->types[i].avail_instance, + i, gvt->types[i].name, + min(low_avail / conf->low_mm, + high_avail / conf->high_mm), conf->low_mm, conf->high_mm, conf->fence, conf->weight, vgpu_edid_str(conf->edid)); @@ -157,36 +157,6 @@ void intel_gvt_clean_vgpu_types(struct intel_gvt *gvt) kfree(gvt->types); } -static void intel_gvt_update_vgpu_types(struct intel_gvt *gvt) -{ - int i; - unsigned int low_gm_avail, high_gm_avail, fence_avail; - unsigned int low_gm_min, high_gm_min, fence_min; - - /* Need to depend on maxium hw resource size but keep on - * static config for now. - */ - low_gm_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE - - gvt->gm.vgpu_allocated_low_gm_size; - high_gm_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE - - gvt->gm.vgpu_allocated_high_gm_size; - fence_avail = gvt_fence_sz(gvt) - HOST_FENCE - - gvt->fence.vgpu_allocated_fence_num; - - for (i = 0; i < gvt->num_types; i++) { - low_gm_min = low_gm_avail / gvt->types[i].conf->low_mm; - high_gm_min = high_gm_avail / gvt->types[i].conf->high_mm; - fence_min = fence_avail / gvt->types[i].conf->fence; - gvt->types[i].avail_instance = min(min(low_gm_min, high_gm_min), - fence_min); - - gvt_dbg_core("update type[%d]: %s avail %u low %u high %u fence %u\n", - i, gvt->types[i].name, - gvt->types[i].avail_instance, gvt->types[i].conf->low_mm, - gvt->types[i].conf->high_mm, gvt->types[i].conf->fence); - } -} - /** * intel_gvt_active_vgpu - activate a virtual GPU * @vgpu: virtual GPU @@ -281,10 +251,6 @@ void intel_gvt_destroy_vgpu(struct intel_vgpu *vgpu) intel_vgpu_clean_mmio(vgpu); intel_vgpu_dmabuf_cleanup(vgpu); mutex_unlock(&vgpu->vgpu_lock); - - mutex_lock(&gvt->lock); - intel_gvt_update_vgpu_types(gvt); - mutex_unlock(&gvt->lock); } #define IDLE_VGPU_IDR 0 @@ -414,7 +380,6 @@ int intel_gvt_create_vgpu(struct intel_vgpu *vgpu, if (ret) goto out_clean_sched_policy; - intel_gvt_update_vgpu_types(gvt); intel_gvt_update_reg_whitelist(vgpu); mutex_unlock(&gvt->lock); return 0; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 394aab60dbd0..559ca1805592 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -44,20 +44,12 @@ static void vfio_ccw_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length) vfio_ccw_mdev_reset(private); } -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int vfio_ccw_get_available(struct mdev_type *mtype) { struct vfio_ccw_private *private = dev_get_drvdata(mtype->parent->dev); - return sprintf(buf, "%d\n", atomic_read(&private->avail)); + return atomic_read(&private->avail); } -static MDEV_TYPE_ATTR_RO(available_instances); - -static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_available_instances.attr, - NULL, -}; static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) { @@ -620,5 +612,5 @@ struct mdev_driver vfio_ccw_mdev_driver = { }, .probe = vfio_ccw_mdev_probe, .remove = vfio_ccw_mdev_remove, - .types_attrs = mdev_types_attrs, + .get_available = vfio_ccw_get_available, }; diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 5d8dd7e837f3..8606f5d75188 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -790,21 +790,11 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) vfio_put_device(&matrix_mdev->vdev); } -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int vfio_ap_mdev_get_available(struct mdev_type *mtype) { - return sprintf(buf, "%d\n", - atomic_read(&matrix_dev->available_instances)); + return atomic_read(&matrix_dev->available_instances); } -static MDEV_TYPE_ATTR_RO(available_instances); - -static const struct attribute *vfio_ap_mdev_type_attrs[] = { - &mdev_type_attr_available_instances.attr, - NULL, -}; - #define MDEV_SHARING_ERR "Userspace may not re-assign queue %02lx.%04lx " \ "already assigned to %s" @@ -1790,7 +1780,7 @@ static struct mdev_driver vfio_ap_matrix_driver = { }, .probe = vfio_ap_mdev_probe, .remove = vfio_ap_mdev_remove, - .types_attrs = vfio_ap_mdev_type_attrs, + .get_available = vfio_ap_mdev_get_available, }; int vfio_ap_mdev_register(void) diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 34583e6a97f2..b7f87c3eda5e 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -90,10 +90,21 @@ static ssize_t name_show(struct mdev_type *mtype, static MDEV_TYPE_ATTR_RO(name); +static ssize_t available_instances_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) +{ + struct mdev_driver *drv = mtype->parent->mdev_driver; + + return sysfs_emit(buf, "%u\n", drv->get_available(mtype)); +} +static MDEV_TYPE_ATTR_RO(available_instances); + static struct attribute *mdev_types_core_attrs[] = { &mdev_type_attr_create.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_name.attr, + &mdev_type_attr_available_instances.attr, NULL, }; diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 4bb8a58b577b..d39e08a1824c 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -72,6 +72,7 @@ struct mdev_type_attribute { * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed + * @get_available: Return the max number of instances that can be created * @types_attrs: attributes to the type kobjects. * @driver: device driver structure **/ @@ -79,6 +80,7 @@ struct mdev_driver { const char *device_api; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); + unsigned int (*get_available)(struct mdev_type *mtype); const struct attribute * const *types_attrs; struct device_driver driver; }; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 0b7585f16d8a..6c2cbc4e25ca 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1361,21 +1361,16 @@ static ssize_t description_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(description); -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int mbochs_get_available(struct mdev_type *mtype) { struct mbochs_type *type = container_of(mtype, struct mbochs_type, type); - int count = atomic_read(&mbochs_avail_mbytes) / type->mbytes; - return sprintf(buf, "%d\n", count); + return atomic_read(&mbochs_avail_mbytes) / type->mbytes; } -static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_description.attr, - &mdev_type_attr_available_instances.attr, NULL, }; @@ -1399,6 +1394,7 @@ static struct mdev_driver mbochs_driver = { }, .probe = mbochs_probe, .remove = mbochs_remove, + .get_available = mbochs_get_available, .types_attrs = mdev_types_attrs, }; diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 90c6fed200b1..d1c835c9cabf 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -671,17 +671,13 @@ static ssize_t description_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(description); -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int mdpy_get_available(struct mdev_type *mtype) { - return sprintf(buf, "%d\n", max_devices - mdpy_count); + return max_devices - mdpy_count; } -static MDEV_TYPE_ATTR_RO(available_instances); static const struct attribute *mdev_types_attrs[] = { &mdev_type_attr_description.attr, - &mdev_type_attr_available_instances.attr, NULL, }; @@ -704,6 +700,7 @@ static struct mdev_driver mdpy_driver = { }, .probe = mdpy_probe, .remove = mdpy_remove, + .get_available = mdpy_get_available, .types_attrs = mdev_types_attrs, }; diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index eab1b4442a96..e72085fc1376 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1256,23 +1256,13 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t available_instances_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, - char *buf) +static unsigned int mtty_get_available(struct mdev_type *mtype) { struct mtty_type *type = container_of(mtype, struct mtty_type, type); - return sprintf(buf, "%d\n", atomic_read(&mdev_avail_ports) / - type->nr_ports); + return atomic_read(&mdev_avail_ports) / type->nr_ports; } -static MDEV_TYPE_ATTR_RO(available_instances); - -static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_available_instances.attr, - NULL, -}; - static const struct vfio_device_ops mtty_dev_ops = { .name = "vfio-mtty", .init = mtty_init_dev, @@ -1292,7 +1282,7 @@ static struct mdev_driver mtty_driver = { }, .probe = mtty_probe, .remove = mtty_remove, - .types_attrs = mdev_types_attrs, + .get_available = mtty_get_available, }; static void mtty_device_release(struct device *dev) From 685a1537f4c603cfcaf4b9be56ff6a571f7ddd08 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Sep 2022 11:26:51 +0200 Subject: [PATCH 70/77] vfio/mdev: consolidate all the description sysfs into the core code Every driver just emits a string, simply add a method to the mdev_driver to return it and provide a standard sysfs show function. Remove the now unused types_attrs field in struct mdev_driver and the support code for it. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Link: https://lore.kernel.org/r/20220923092652.100656-14-hch@lst.de Signed-off-by: Alex Williamson --- .../driver-api/vfio-mediated-device.rst | 4 +- drivers/gpu/drm/i915/gvt/kvmgt.c | 18 +++------ drivers/vfio/mdev/mdev_driver.c | 2 +- drivers/vfio/mdev/mdev_sysfs.c | 40 +++++++++++++++---- include/linux/mdev.h | 19 +-------- samples/vfio-mdev/mbochs.c | 11 +---- samples/vfio-mdev/mdpy.c | 11 +---- 7 files changed, 46 insertions(+), 59 deletions(-) diff --git a/Documentation/driver-api/vfio-mediated-device.rst b/Documentation/driver-api/vfio-mediated-device.rst index 558bd7ebced8..fdf7d69378ec 100644 --- a/Documentation/driver-api/vfio-mediated-device.rst +++ b/Documentation/driver-api/vfio-mediated-device.rst @@ -104,7 +104,7 @@ structure to represent a mediated device's driver:: int (*probe) (struct mdev_device *dev); void (*remove) (struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); - const struct attribute * const *types_attrs; + ssize_t (*show_description)(struct mdev_type *mtype, char *buf); struct device_driver driver; }; @@ -222,7 +222,7 @@ Directories and files under the sysfs for Each Physical Device * description - This attribute should show brief features/description of the type. This is + This attribute can show brief features/description of the type. This is an optional attribute. Directories and Files Under the sysfs for Each mdev Device diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 45051aedb319..7a45e5360caf 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -113,8 +113,7 @@ static void kvmgt_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot, struct kvm_page_track_notifier_node *node); -static ssize_t description_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) +static ssize_t intel_vgpu_show_description(struct mdev_type *mtype, char *buf) { struct intel_vgpu_type *type = container_of(mtype, struct intel_vgpu_type, type); @@ -128,13 +127,6 @@ static ssize_t description_show(struct mdev_type *mtype, type->conf->weight); } -static MDEV_TYPE_ATTR_RO(description); - -static const struct attribute *gvt_type_attrs[] = { - &mdev_type_attr_description.attr, - NULL, -}; - static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn, unsigned long size) { @@ -1549,10 +1541,10 @@ static struct mdev_driver intel_vgpu_mdev_driver = { .owner = THIS_MODULE, .dev_groups = intel_vgpu_groups, }, - .probe = intel_vgpu_probe, - .remove = intel_vgpu_remove, - .get_available = intel_vgpu_get_available, - .types_attrs = gvt_type_attrs, + .probe = intel_vgpu_probe, + .remove = intel_vgpu_remove, + .get_available = intel_vgpu_get_available, + .show_description = intel_vgpu_show_description, }; int intel_gvt_page_track_add(struct intel_vgpu *info, u64 gfn) diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index 60e8b9f6474e..7825d83a55f8 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -55,7 +55,7 @@ struct bus_type mdev_bus_type = { **/ int mdev_register_driver(struct mdev_driver *drv) { - if (!drv->types_attrs || !drv->device_api) + if (!drv->device_api) return -EINVAL; /* initialize common driver fields */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index b7f87c3eda5e..658b3bf5ed0b 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -14,7 +14,19 @@ #include "mdev_private.h" -/* Static functions */ +struct mdev_type_attribute { + struct attribute attr; + ssize_t (*show)(struct mdev_type *mtype, + struct mdev_type_attribute *attr, char *buf); + ssize_t (*store)(struct mdev_type *mtype, + struct mdev_type_attribute *attr, const char *buf, + size_t count); +}; + +#define MDEV_TYPE_ATTR_RO(_name) \ + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) +#define MDEV_TYPE_ATTR_WO(_name) \ + struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name) static ssize_t mdev_type_attr_show(struct kobject *kobj, struct attribute *__attr, char *buf) @@ -100,16 +112,35 @@ static ssize_t available_instances_show(struct mdev_type *mtype, } static MDEV_TYPE_ATTR_RO(available_instances); +static ssize_t description_show(struct mdev_type *mtype, + struct mdev_type_attribute *attr, + char *buf) +{ + return mtype->parent->mdev_driver->show_description(mtype, buf); +} +static MDEV_TYPE_ATTR_RO(description); + static struct attribute *mdev_types_core_attrs[] = { &mdev_type_attr_create.attr, &mdev_type_attr_device_api.attr, &mdev_type_attr_name.attr, &mdev_type_attr_available_instances.attr, + &mdev_type_attr_description.attr, NULL, }; +static umode_t mdev_types_core_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + if (attr == &mdev_type_attr_description.attr && + !to_mdev_type(kobj)->parent->mdev_driver->show_description) + return 0; + return attr->mode; +} + static struct attribute_group mdev_type_core_group = { .attrs = mdev_types_core_attrs, + .is_visible = mdev_types_core_is_visible, }; static const struct attribute_group *mdev_type_groups[] = { @@ -155,13 +186,8 @@ static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) goto attr_devices_failed; } - ret = sysfs_create_files(&type->kobj, parent->mdev_driver->types_attrs); - if (ret) - goto attrs_failed; return 0; -attrs_failed: - kobject_put(type->devices_kobj); attr_devices_failed: kobject_del(&type->kobj); kobject_put(&type->kobj); @@ -170,8 +196,6 @@ static int mdev_type_add(struct mdev_parent *parent, struct mdev_type *type) static void mdev_type_remove(struct mdev_type *type) { - sysfs_remove_files(&type->kobj, type->parent->mdev_driver->types_attrs); - kobject_put(type->devices_kobj); kobject_del(&type->kobj); kobject_put(&type->kobj); diff --git a/include/linux/mdev.h b/include/linux/mdev.h index d39e08a1824c..33674cb5ed5d 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -52,28 +52,13 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -/* interface for exporting mdev supported type attributes */ -struct mdev_type_attribute { - struct attribute attr; - ssize_t (*show)(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf); - ssize_t (*store)(struct mdev_type *mtype, - struct mdev_type_attribute *attr, const char *buf, - size_t count); -}; - -#define MDEV_TYPE_ATTR_RO(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_RO(_name) -#define MDEV_TYPE_ATTR_WO(_name) \ - struct mdev_type_attribute mdev_type_attr_##_name = __ATTR_WO(_name) - /** * struct mdev_driver - Mediated device driver * @device_api: string to return for the device_api sysfs * @probe: called when new device created * @remove: called when device removed * @get_available: Return the max number of instances that can be created - * @types_attrs: attributes to the type kobjects. + * @show_description: Print a description of the mtype * @driver: device driver structure **/ struct mdev_driver { @@ -81,7 +66,7 @@ struct mdev_driver { int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); - const struct attribute * const *types_attrs; + ssize_t (*show_description)(struct mdev_type *mtype, char *buf); struct device_driver driver; }; diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index 6c2cbc4e25ca..117a8d799f71 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1350,8 +1350,7 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t description_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) +static ssize_t mbochs_show_description(struct mdev_type *mtype, char *buf) { struct mbochs_type *type = container_of(mtype, struct mbochs_type, type); @@ -1359,7 +1358,6 @@ static ssize_t description_show(struct mdev_type *mtype, return sprintf(buf, "virtual display, %d MB video memory\n", type ? type->mbytes : 0); } -static MDEV_TYPE_ATTR_RO(description); static unsigned int mbochs_get_available(struct mdev_type *mtype) { @@ -1369,11 +1367,6 @@ static unsigned int mbochs_get_available(struct mdev_type *mtype) return atomic_read(&mbochs_avail_mbytes) / type->mbytes; } -static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_description.attr, - NULL, -}; - static const struct vfio_device_ops mbochs_dev_ops = { .close_device = mbochs_close_device, .init = mbochs_init_dev, @@ -1395,7 +1388,7 @@ static struct mdev_driver mbochs_driver = { .probe = mbochs_probe, .remove = mbochs_remove, .get_available = mbochs_get_available, - .types_attrs = mdev_types_attrs, + .show_description = mbochs_show_description, }; static const struct file_operations vd_fops = { diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index d1c835c9cabf..a7cf59246ddd 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -661,26 +661,19 @@ static const struct attribute_group *mdev_dev_groups[] = { NULL, }; -static ssize_t description_show(struct mdev_type *mtype, - struct mdev_type_attribute *attr, char *buf) +static ssize_t mdpy_show_description(struct mdev_type *mtype, char *buf) { struct mdpy_type *type = container_of(mtype, struct mdpy_type, type); return sprintf(buf, "virtual display, %dx%d framebuffer\n", type->width, type->height); } -static MDEV_TYPE_ATTR_RO(description); static unsigned int mdpy_get_available(struct mdev_type *mtype) { return max_devices - mdpy_count; } -static const struct attribute *mdev_types_attrs[] = { - &mdev_type_attr_description.attr, - NULL, -}; - static const struct vfio_device_ops mdpy_dev_ops = { .init = mdpy_init_dev, .release = mdpy_release_dev, @@ -701,7 +694,7 @@ static struct mdev_driver mdpy_driver = { .probe = mdpy_probe, .remove = mdpy_remove, .get_available = mdpy_get_available, - .types_attrs = mdev_types_attrs, + .show_description = mdpy_show_description, }; static const struct file_operations vd_fops = { From 9c799c224d6ebc5be51065bd3217a2d7eea23b8f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 23 Sep 2022 11:26:52 +0200 Subject: [PATCH 71/77] vfio/mdev: add mdev available instance checking to the core Many of the mdev drivers use a simple counter for keeping track of the available instances. Move this code to the core code and store the counter in the mdev_parent. Implement it using correct locking, fixing mdpy. Drivers just provide the value in the mdev_driver at registration time and the core code takes care of maintaining it and exposing the value in sysfs. [hch: count instances per-parent instead of per-type, use an atomic_t to avoid taking mdev_list_lock in the show method] Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Kevin Tian Reviewed-by: Kirti Wankhede Reviewed-by: Eric Farman Link: https://lore.kernel.org/r/20220923092652.100656-15-hch@lst.de Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 1 - drivers/s390/cio/vfio_ccw_ops.c | 15 +-------------- drivers/s390/cio/vfio_ccw_private.h | 2 -- drivers/s390/crypto/vfio_ap_ops.c | 13 +------------ drivers/s390/crypto/vfio_ap_private.h | 2 -- drivers/vfio/mdev/mdev_core.c | 22 +++++++++++++++++++--- drivers/vfio/mdev/mdev_sysfs.c | 5 ++++- include/linux/mdev.h | 3 +++ samples/vfio-mdev/mdpy.c | 22 ++++------------------ 9 files changed, 32 insertions(+), 53 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index e5f21c725326..7f5402fe857a 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -141,7 +141,6 @@ static struct vfio_ccw_private *vfio_ccw_alloc_private(struct subchannel *sch) INIT_LIST_HEAD(&private->crw); INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); - atomic_set(&private->avail, 1); private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), GFP_KERNEL); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 559ca1805592..6ae4d012d800 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -44,13 +44,6 @@ static void vfio_ccw_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length) vfio_ccw_mdev_reset(private); } -static unsigned int vfio_ccw_get_available(struct mdev_type *mtype) -{ - struct vfio_ccw_private *private = dev_get_drvdata(mtype->parent->dev); - - return atomic_read(&private->avail); -} - static int vfio_ccw_mdev_init_dev(struct vfio_device *vdev) { struct vfio_ccw_private *private = @@ -68,9 +61,6 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) if (private->state == VFIO_CCW_STATE_NOT_OPER) return -ENODEV; - if (atomic_dec_if_positive(&private->avail) < 0) - return -EPERM; - ret = vfio_init_device(&private->vdev, &mdev->dev, &vfio_ccw_dev_ops); if (ret) return ret; @@ -88,7 +78,6 @@ static int vfio_ccw_mdev_probe(struct mdev_device *mdev) err_put_vdev: vfio_put_device(&private->vdev); - atomic_inc(&private->avail); return ret; } @@ -130,8 +119,6 @@ static void vfio_ccw_mdev_remove(struct mdev_device *mdev) * cycle. */ wait_for_completion(&private->release_comp); - - atomic_inc(&private->avail); } static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) @@ -605,6 +592,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { struct mdev_driver vfio_ccw_mdev_driver = { .device_api = VFIO_DEVICE_API_CCW_STRING, + .max_instances = 1, .driver = { .name = "vfio_ccw_mdev", .owner = THIS_MODULE, @@ -612,5 +600,4 @@ struct mdev_driver vfio_ccw_mdev_driver = { }, .probe = vfio_ccw_mdev_probe, .remove = vfio_ccw_mdev_remove, - .get_available = vfio_ccw_get_available, }; diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index 52caa721ec06..bd5fb81456af 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -73,7 +73,6 @@ struct vfio_ccw_crw { * @sch: pointer to the subchannel * @state: internal state of the device * @completion: synchronization helper of the I/O completion - * @avail: available for creating a mediated device * @io_region: MMIO region to input/output I/O arguments/results * @io_mutex: protect against concurrent update of I/O regions * @region: additional regions for other subchannel operations @@ -97,7 +96,6 @@ struct vfio_ccw_private { struct subchannel *sch; int state; struct completion *completion; - atomic_t avail; struct ccw_io_region *io_region; struct mutex io_mutex; struct vfio_ccw_region *region; diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 8606f5d75188..2884189f3877 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -689,9 +689,6 @@ static int vfio_ap_mdev_init_dev(struct vfio_device *vdev) struct ap_matrix_mdev *matrix_mdev = container_of(vdev, struct ap_matrix_mdev, vdev); - if ((atomic_dec_if_positive(&matrix_dev->available_instances) < 0)) - return -EPERM; - matrix_mdev->mdev = to_mdev_device(vdev->dev); vfio_ap_matrix_init(&matrix_dev->info, &matrix_mdev->matrix); matrix_mdev->pqap_hook = handle_pqap; @@ -770,7 +767,6 @@ static void vfio_ap_mdev_unlink_fr_queues(struct ap_matrix_mdev *matrix_mdev) static void vfio_ap_mdev_release_dev(struct vfio_device *vdev) { - atomic_inc(&matrix_dev->available_instances); vfio_free_device(vdev); } @@ -790,11 +786,6 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) vfio_put_device(&matrix_mdev->vdev); } -static unsigned int vfio_ap_mdev_get_available(struct mdev_type *mtype) -{ - return atomic_read(&matrix_dev->available_instances); -} - #define MDEV_SHARING_ERR "Userspace may not re-assign queue %02lx.%04lx " \ "already assigned to %s" @@ -1772,6 +1763,7 @@ static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { static struct mdev_driver vfio_ap_matrix_driver = { .device_api = VFIO_DEVICE_API_AP_STRING, + .max_instances = MAX_ZDEV_ENTRIES_EXT, .driver = { .name = "vfio_ap_mdev", .owner = THIS_MODULE, @@ -1780,15 +1772,12 @@ static struct mdev_driver vfio_ap_matrix_driver = { }, .probe = vfio_ap_mdev_probe, .remove = vfio_ap_mdev_remove, - .get_available = vfio_ap_mdev_get_available, }; int vfio_ap_mdev_register(void) { int ret; - atomic_set(&matrix_dev->available_instances, MAX_ZDEV_ENTRIES_EXT); - ret = mdev_register_driver(&vfio_ap_matrix_driver); if (ret) return ret; diff --git a/drivers/s390/crypto/vfio_ap_private.h b/drivers/s390/crypto/vfio_ap_private.h index 441dc8dda380..2eddd5f34ed3 100644 --- a/drivers/s390/crypto/vfio_ap_private.h +++ b/drivers/s390/crypto/vfio_ap_private.h @@ -29,7 +29,6 @@ * struct ap_matrix_dev - Contains the data for the matrix device. * * @device: generic device structure associated with the AP matrix device - * @available_instances: number of mediated matrix devices that can be created * @info: the struct containing the output from the PQAP(QCI) instruction * @mdev_list: the list of mediated matrix devices created * @mdevs_lock: mutex for locking the AP matrix device. This lock will be @@ -46,7 +45,6 @@ */ struct ap_matrix_dev { struct device device; - atomic_t available_instances; struct ap_config_info info; struct list_head mdev_list; struct mutex mdevs_lock; /* serializes access to each ap_matrix_mdev */ diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 93f8caf2e5f7..58f91b3bd670 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -70,6 +70,7 @@ int mdev_register_parent(struct mdev_parent *parent, struct device *dev, parent->mdev_driver = mdev_driver; parent->types = types; parent->nr_types = nr_types; + atomic_set(&parent->available_instances, mdev_driver->max_instances); if (!mdev_bus_compat_class) { mdev_bus_compat_class = class_compat_register("mdev_bus"); @@ -115,14 +116,17 @@ EXPORT_SYMBOL(mdev_unregister_parent); static void mdev_device_release(struct device *dev) { struct mdev_device *mdev = to_mdev_device(dev); - - /* Pairs with the get in mdev_device_create() */ - kobject_put(&mdev->type->kobj); + struct mdev_parent *parent = mdev->type->parent; mutex_lock(&mdev_list_lock); list_del(&mdev->next); + if (!parent->mdev_driver->get_available) + atomic_inc(&parent->available_instances); mutex_unlock(&mdev_list_lock); + /* Pairs with the get in mdev_device_create() */ + kobject_put(&mdev->type->kobj); + dev_dbg(&mdev->dev, "MDEV: destroying\n"); kfree(mdev); } @@ -144,6 +148,18 @@ int mdev_device_create(struct mdev_type *type, const guid_t *uuid) } } + if (!drv->get_available) { + /* + * Note: that non-atomic read and dec is fine here because + * all modifications are under mdev_list_lock. + */ + if (!atomic_read(&parent->available_instances)) { + mutex_unlock(&mdev_list_lock); + return -EUSERS; + } + atomic_dec(&parent->available_instances); + } + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) { mutex_unlock(&mdev_list_lock); diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 658b3bf5ed0b..abe3359dd477 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -108,7 +108,10 @@ static ssize_t available_instances_show(struct mdev_type *mtype, { struct mdev_driver *drv = mtype->parent->mdev_driver; - return sysfs_emit(buf, "%u\n", drv->get_available(mtype)); + if (drv->get_available) + return sysfs_emit(buf, "%u\n", drv->get_available(mtype)); + return sysfs_emit(buf, "%u\n", + atomic_read(&mtype->parent->available_instances)); } static MDEV_TYPE_ATTR_RO(available_instances); diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 33674cb5ed5d..139d05b26f82 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -45,6 +45,7 @@ struct mdev_parent { struct rw_semaphore unreg_sem; struct mdev_type **types; unsigned int nr_types; + atomic_t available_instances; }; static inline struct mdev_device *to_mdev_device(struct device *dev) @@ -55,6 +56,7 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) /** * struct mdev_driver - Mediated device driver * @device_api: string to return for the device_api sysfs + * @max_instances: maximum number of instances supported (optional) * @probe: called when new device created * @remove: called when device removed * @get_available: Return the max number of instances that can be created @@ -63,6 +65,7 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) **/ struct mdev_driver { const char *device_api; + unsigned int max_instances; int (*probe)(struct mdev_device *dev); void (*remove)(struct mdev_device *dev); unsigned int (*get_available)(struct mdev_type *mtype); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index a7cf59246ddd..946e8cfde6fd 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -42,11 +42,6 @@ MODULE_LICENSE("GPL v2"); -static int max_devices = 4; -module_param_named(count, max_devices, int, 0444); -MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); - - #define MDPY_TYPE_1 "vga" #define MDPY_TYPE_2 "xga" #define MDPY_TYPE_3 "hd" @@ -93,7 +88,6 @@ static struct class *mdpy_class; static struct cdev mdpy_cdev; static struct device mdpy_dev; static struct mdev_parent mdpy_parent; -static u32 mdpy_count; static const struct vfio_device_ops mdpy_dev_ops; /* State of each mdev device */ @@ -235,9 +229,6 @@ static int mdpy_init_dev(struct vfio_device *vdev) u32 fbsize; int ret = -ENOMEM; - if (mdpy_count >= max_devices) - return ret; - mdev_state->vconfig = kzalloc(MDPY_CONFIG_SPACE_SIZE, GFP_KERNEL); if (!mdev_state->vconfig) return ret; @@ -257,8 +248,6 @@ static int mdpy_init_dev(struct vfio_device *vdev) dev_info(vdev->dev, "%s: %s (%dx%d)\n", __func__, type->type.pretty_name, type->width, type->height); - - mdpy_count++; return 0; out_vconfig: @@ -292,7 +281,6 @@ static void mdpy_release_dev(struct vfio_device *vdev) struct mdev_state *mdev_state = container_of(vdev, struct mdev_state, vdev); - mdpy_count--; vfree(mdev_state->memblk); kfree(mdev_state->vconfig); vfio_free_device(vdev); @@ -669,11 +657,6 @@ static ssize_t mdpy_show_description(struct mdev_type *mtype, char *buf) type->width, type->height); } -static unsigned int mdpy_get_available(struct mdev_type *mtype) -{ - return max_devices - mdpy_count; -} - static const struct vfio_device_ops mdpy_dev_ops = { .init = mdpy_init_dev, .release = mdpy_release_dev, @@ -685,6 +668,7 @@ static const struct vfio_device_ops mdpy_dev_ops = { static struct mdev_driver mdpy_driver = { .device_api = VFIO_DEVICE_API_PCI_STRING, + .max_instances = 4, .driver = { .name = "mdpy", .owner = THIS_MODULE, @@ -693,7 +677,6 @@ static struct mdev_driver mdpy_driver = { }, .probe = mdpy_probe, .remove = mdpy_remove, - .get_available = mdpy_get_available, .show_description = mdpy_show_description, }; @@ -770,5 +753,8 @@ static void __exit mdpy_dev_exit(void) mdpy_class = NULL; } +module_param_named(count, mdpy_driver.max_instances, int, 0444); +MODULE_PARM_DESC(count, "number of " MDPY_NAME " devices"); + module_init(mdpy_dev_init) module_exit(mdpy_dev_exit) From 912b74d26c7df2da1e261f3dac8942c8cbb76a49 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 29 Sep 2022 11:59:24 -0300 Subject: [PATCH 72/77] vfio: Remove the vfio_group->users and users_comp Kevin points out that the users is really just tracking if group->opened_file is set, so we can simplify this code to a wait_queue that looks for !opened_file under the group_rwsem. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/1-v1-917e3647f123+b1a-vfio_group_users_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.h | 3 +-- drivers/vfio/vfio_main.c | 45 +++++++++++++++++----------------------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 039e3208d286..78b362a92501 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -48,8 +48,6 @@ struct vfio_group { * reaches 0 then the iommu_group is invalid. */ refcount_t drivers; - refcount_t users; - struct completion users_comp; unsigned int container_users; struct iommu_group *iommu_group; struct vfio_container *container; @@ -61,6 +59,7 @@ struct vfio_group { struct rw_semaphore group_rwsem; struct kvm *kvm; struct file *opened_file; + struct swait_queue_head opened_file_wait; struct blocking_notifier_head notifier; }; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f19171cad9a2..57a7576a96a6 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -186,10 +186,9 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, cdev_init(&group->cdev, &vfio_group_fops); group->cdev.owner = THIS_MODULE; - refcount_set(&group->users, 1); refcount_set(&group->drivers, 1); - init_completion(&group->users_comp); init_rwsem(&group->group_rwsem); + init_swait_queue_head(&group->opened_file_wait); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; @@ -245,12 +244,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, return ret; } -static void vfio_group_put(struct vfio_group *group) -{ - if (refcount_dec_and_test(&group->users)) - complete(&group->users_comp); -} - static void vfio_device_remove_group(struct vfio_device *device) { struct vfio_group *group = device->group; @@ -270,10 +263,6 @@ static void vfio_device_remove_group(struct vfio_device *device) * cdev_device_add() will fail due to the name aready existing. */ cdev_device_del(&group->cdev, &group->dev); - mutex_unlock(&vfio.group_lock); - - /* Matches the get from vfio_group_alloc() */ - vfio_group_put(group); /* * Before we allow the last driver in the group to be unplugged the @@ -281,7 +270,13 @@ static void vfio_device_remove_group(struct vfio_device *device) * is because the group->iommu_group pointer should only be used so long * as a device driver is attached to a device in the group. */ - wait_for_completion(&group->users_comp); + while (group->opened_file) { + mutex_unlock(&vfio.group_lock); + swait_event_idle_exclusive(group->opened_file_wait, + !group->opened_file); + mutex_lock(&vfio.group_lock); + } + mutex_unlock(&vfio.group_lock); /* * These data structures all have paired operations that can only be @@ -906,15 +901,18 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) down_write(&group->group_rwsem); - /* users can be zero if this races with vfio_device_remove_group() */ - if (!refcount_inc_not_zero(&group->users)) { + /* + * drivers can be zero if this races with vfio_device_remove_group(), it + * will be stable at 0 under the group rwsem + */ + if (refcount_read(&group->drivers) == 0) { ret = -ENODEV; - goto err_unlock; + goto out_unlock; } if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { ret = -EPERM; - goto err_put; + goto out_unlock; } /* @@ -922,16 +920,12 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) */ if (group->opened_file) { ret = -EBUSY; - goto err_put; + goto out_unlock; } group->opened_file = filep; filep->private_data = group; - - up_write(&group->group_rwsem); - return 0; -err_put: - vfio_group_put(group); -err_unlock: + ret = 0; +out_unlock: up_write(&group->group_rwsem); return ret; } @@ -952,8 +946,7 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) vfio_group_detach_container(group); group->opened_file = NULL; up_write(&group->group_rwsem); - - vfio_group_put(group); + swake_up_one(&group->opened_file_wait); return 0; } From c82e81ab2569559ad873b3061217c2f37560682b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 29 Sep 2022 11:59:25 -0300 Subject: [PATCH 73/77] vfio: Change vfio_group->group_rwsem to a mutex These days not much is using the read side: - device first open - ioctl_get_status - device FD release - check enforced_coherent None of this is performance, so just make it into a normal mutex. Signed-off-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/2-v1-917e3647f123+b1a-vfio_group_users_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/container.c | 10 ++++----- drivers/vfio/vfio.h | 2 +- drivers/vfio/vfio_main.c | 47 ++++++++++++++++++++-------------------- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index db7c071ee3de..d74164abbf40 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -430,7 +430,7 @@ int vfio_container_attach_group(struct vfio_container *container, struct vfio_iommu_driver *driver; int ret = 0; - lockdep_assert_held_write(&group->group_rwsem); + lockdep_assert_held(&group->group_lock); if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; @@ -481,7 +481,7 @@ void vfio_group_detach_container(struct vfio_group *group) struct vfio_container *container = group->container; struct vfio_iommu_driver *driver; - lockdep_assert_held_write(&group->group_rwsem); + lockdep_assert_held(&group->group_lock); WARN_ON(group->container_users != 1); down_write(&container->group_lock); @@ -515,7 +515,7 @@ int vfio_device_assign_container(struct vfio_device *device) { struct vfio_group *group = device->group; - lockdep_assert_held_write(&group->group_rwsem); + lockdep_assert_held(&group->group_lock); if (!group->container || !group->container->iommu_driver || WARN_ON(!group->container_users)) @@ -531,11 +531,11 @@ int vfio_device_assign_container(struct vfio_device *device) void vfio_device_unassign_container(struct vfio_device *device) { - down_write(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); WARN_ON(device->group->container_users <= 1); device->group->container_users--; fput(device->group->opened_file); - up_write(&device->group->group_rwsem); + mutex_unlock(&device->group->group_lock); } /* diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 78b362a92501..4a1bac1359a9 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -56,7 +56,7 @@ struct vfio_group { struct list_head vfio_next; struct list_head container_next; enum vfio_group_type type; - struct rw_semaphore group_rwsem; + struct mutex group_lock; struct kvm *kvm; struct file *opened_file; struct swait_queue_head opened_file_wait; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 57a7576a96a6..9207e6c0e3cb 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -158,6 +158,7 @@ static void vfio_group_release(struct device *dev) struct vfio_group *group = container_of(dev, struct vfio_group, dev); mutex_destroy(&group->device_lock); + mutex_destroy(&group->group_lock); iommu_group_put(group->iommu_group); ida_free(&vfio.group_ida, MINOR(group->dev.devt)); kfree(group); @@ -187,7 +188,7 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, group->cdev.owner = THIS_MODULE; refcount_set(&group->drivers, 1); - init_rwsem(&group->group_rwsem); + mutex_init(&group->group_lock); init_swait_queue_head(&group->opened_file_wait); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); @@ -665,7 +666,7 @@ static int vfio_group_ioctl_unset_container(struct vfio_group *group) { int ret = 0; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); if (!group->container) { ret = -EINVAL; goto out_unlock; @@ -677,7 +678,7 @@ static int vfio_group_ioctl_unset_container(struct vfio_group *group) vfio_group_detach_container(group); out_unlock: - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); return ret; } @@ -696,7 +697,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, if (!f.file) return -EBADF; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); if (group->container || WARN_ON(group->container_users)) { ret = -EINVAL; goto out_unlock; @@ -709,7 +710,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, } out_unlock: - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); fdput(f); return ret; } @@ -727,9 +728,9 @@ static struct file *vfio_device_open(struct vfio_device *device) struct file *filep; int ret; - down_write(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); ret = vfio_device_assign_container(device); - up_write(&device->group->group_rwsem); + mutex_unlock(&device->group->group_lock); if (ret) return ERR_PTR(ret); @@ -746,7 +747,7 @@ static struct file *vfio_device_open(struct vfio_device *device) * lock. If the device driver will use it, it must obtain a * reference and release it during close_device. */ - down_read(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); device->kvm = device->group->kvm; if (device->ops->open_device) { @@ -755,7 +756,7 @@ static struct file *vfio_device_open(struct vfio_device *device) goto err_undo_count; } vfio_device_container_register(device); - up_read(&device->group->group_rwsem); + mutex_unlock(&device->group->group_lock); } mutex_unlock(&device->dev_set->lock); @@ -788,14 +789,14 @@ static struct file *vfio_device_open(struct vfio_device *device) err_close_device: mutex_lock(&device->dev_set->lock); - down_read(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); if (device->open_count == 1 && device->ops->close_device) { device->ops->close_device(device); vfio_device_container_unregister(device); } err_undo_count: - up_read(&device->group->group_rwsem); + mutex_unlock(&device->group->group_lock); device->open_count--; if (device->open_count == 0 && device->kvm) device->kvm = NULL; @@ -860,13 +861,13 @@ static int vfio_group_ioctl_get_status(struct vfio_group *group, status.flags = 0; - down_read(&group->group_rwsem); + mutex_lock(&group->group_lock); if (group->container) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | VFIO_GROUP_FLAGS_VIABLE; else if (!iommu_group_dma_owner_claimed(group->iommu_group)) status.flags |= VFIO_GROUP_FLAGS_VIABLE; - up_read(&group->group_rwsem); + mutex_unlock(&group->group_lock); if (copy_to_user(arg, &status, minsz)) return -EFAULT; @@ -899,7 +900,7 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) container_of(inode->i_cdev, struct vfio_group, cdev); int ret; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); /* * drivers can be zero if this races with vfio_device_remove_group(), it @@ -926,7 +927,7 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) filep->private_data = group; ret = 0; out_unlock: - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); return ret; } @@ -936,7 +937,7 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) filep->private_data = NULL; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); /* * Device FDs hold a group file reference, therefore the group release * is only called when there are no open devices. @@ -945,7 +946,7 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) if (group->container) vfio_group_detach_container(group); group->opened_file = NULL; - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); swake_up_one(&group->opened_file_wait); return 0; @@ -1001,12 +1002,12 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); - down_read(&device->group->group_rwsem); + mutex_lock(&device->group->group_lock); if (device->open_count == 1 && device->ops->close_device) device->ops->close_device(device); vfio_device_container_unregister(device); - up_read(&device->group->group_rwsem); + mutex_unlock(&device->group->group_lock); device->open_count--; if (device->open_count == 0) device->kvm = NULL; @@ -1580,7 +1581,7 @@ bool vfio_file_enforced_coherent(struct file *file) if (file->f_op != &vfio_group_fops) return true; - down_read(&group->group_rwsem); + mutex_lock(&group->group_lock); if (group->container) { ret = vfio_container_ioctl_check_extension(group->container, VFIO_DMA_CC_IOMMU); @@ -1592,7 +1593,7 @@ bool vfio_file_enforced_coherent(struct file *file) */ ret = true; } - up_read(&group->group_rwsem); + mutex_unlock(&group->group_lock); return ret; } EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); @@ -1612,9 +1613,9 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) if (file->f_op != &vfio_group_fops) return; - down_write(&group->group_rwsem); + mutex_lock(&group->group_lock); group->kvm = kvm; - up_write(&group->group_rwsem); + mutex_unlock(&group->group_lock); } EXPORT_SYMBOL_GPL(vfio_file_set_kvm); From 4b22ef042d6f54a6e5899555f2db71749133eca8 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Oct 2022 11:04:39 -0300 Subject: [PATCH 74/77] vfio: Add vfio_file_is_group() This replaces uses of vfio_file_iommu_group() which were only detecting if the file is a VFIO file with no interest in the actual group. The only remaning user of vfio_file_iommu_group() is in KVM for the SPAPR stuff. It passes the iommu_group into the arch code through kvm for some reason. Tested-by: Matthew Rosato Tested-by: Christian Borntraeger Tested-by: Eric Farman Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v2-15417f29324e+1c-vfio_group_disassociate_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 2 +- drivers/vfio/vfio_main.c | 16 +++++++++++++++- include/linux/vfio.h | 1 + virt/kvm/vfio.c | 20 ++++++++++++++++++-- 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 59a28251bb0b..badc9d828cac 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1313,7 +1313,7 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, } /* Ensure the FD is a vfio group FD.*/ - if (!vfio_file_iommu_group(file)) { + if (!vfio_file_is_group(file)) { fput(file); ret = -EINVAL; break; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9207e6c0e3cb..9f830d0a25b7 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1553,17 +1553,31 @@ static const struct file_operations vfio_device_fops = { * @file: VFIO group file * * The returned iommu_group is valid as long as a ref is held on the file. + * This function is deprecated, only the SPAPR path in kvm should call it. */ struct iommu_group *vfio_file_iommu_group(struct file *file) { struct vfio_group *group = file->private_data; - if (file->f_op != &vfio_group_fops) + if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) + return NULL; + + if (!vfio_file_is_group(file)) return NULL; return group->iommu_group; } EXPORT_SYMBOL_GPL(vfio_file_iommu_group); +/** + * vfio_file_is_group - True if the file is usable with VFIO aPIS + * @file: VFIO group file + */ +bool vfio_file_is_group(struct file *file) +{ + return file->f_op == &vfio_group_fops; +} +EXPORT_SYMBOL_GPL(vfio_file_is_group); + /** * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file * is always CPU cache coherent diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ee399a768070..e7cebeb875dd 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -199,6 +199,7 @@ int vfio_mig_get_next_state(struct vfio_device *device, * External user API */ struct iommu_group *vfio_file_iommu_group(struct file *file); +bool vfio_file_is_group(struct file *file); bool vfio_file_enforced_coherent(struct file *file); void vfio_file_set_kvm(struct file *file, struct kvm *kvm); bool vfio_file_has_dev(struct file *file, struct vfio_device *device); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index ce1b01d02c51..54aec3b0559c 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -61,6 +61,23 @@ static bool kvm_vfio_file_enforced_coherent(struct file *file) return ret; } +static bool kvm_vfio_file_is_group(struct file *file) +{ + bool (*fn)(struct file *file); + bool ret; + + fn = symbol_get(vfio_file_is_group); + if (!fn) + return false; + + ret = fn(file); + + symbol_put(vfio_file_is_group); + + return ret; +} + +#ifdef CONFIG_SPAPR_TCE_IOMMU static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) { struct iommu_group *(*fn)(struct file *file); @@ -77,7 +94,6 @@ static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) return ret; } -#ifdef CONFIG_SPAPR_TCE_IOMMU static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, struct kvm_vfio_group *kvg) { @@ -136,7 +152,7 @@ static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) return -EBADF; /* Ensure the FD is a vfio group FD.*/ - if (!kvm_vfio_file_iommu_group(filp)) { + if (!kvm_vfio_file_is_group(filp)) { ret = -EINVAL; goto err_fput; } From 819da99a7360f7e197038d12f0eba626bde11856 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Oct 2022 11:04:40 -0300 Subject: [PATCH 75/77] vfio: Hold a reference to the iommu_group in kvm for SPAPR SPAPR exists completely outside the normal iommu driver framework, the groups it creates are fake and are only created to enable VFIO's uAPI. Thus, it does not need to follow the iommu core rule that the iommu_group will only be touched while a driver is attached. Carry a group reference into KVM and have KVM directly manage the lifetime of this object independently of VFIO. This means KVM no longer relies on the vfio group file being valid to maintain the group reference. Tested-by: Matthew Rosato Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v2-15417f29324e+1c-vfio_group_disassociate_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 6 ++++-- virt/kvm/vfio.c | 25 ++++++++++++++----------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9f830d0a25b7..911ee1abdff0 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1552,8 +1552,9 @@ static const struct file_operations vfio_device_fops = { * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file * @file: VFIO group file * - * The returned iommu_group is valid as long as a ref is held on the file. - * This function is deprecated, only the SPAPR path in kvm should call it. + * The returned iommu_group is valid as long as a ref is held on the file. This + * returns a reference on the group. This function is deprecated, only the SPAPR + * path in kvm should call it. */ struct iommu_group *vfio_file_iommu_group(struct file *file) { @@ -1564,6 +1565,7 @@ struct iommu_group *vfio_file_iommu_group(struct file *file) if (!vfio_file_is_group(file)) return NULL; + iommu_group_ref_get(group->iommu_group); return group->iommu_group; } EXPORT_SYMBOL_GPL(vfio_file_iommu_group); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 54aec3b0559c..495ceabffe88 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -24,6 +24,9 @@ struct kvm_vfio_group { struct list_head node; struct file *file; +#ifdef CONFIG_SPAPR_TCE_IOMMU + struct iommu_group *iommu_group; +#endif }; struct kvm_vfio { @@ -97,12 +100,12 @@ static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, struct kvm_vfio_group *kvg) { - struct iommu_group *grp = kvm_vfio_file_iommu_group(kvg->file); - - if (WARN_ON_ONCE(!grp)) + if (WARN_ON_ONCE(!kvg->iommu_group)) return; - kvm_spapr_tce_release_iommu_group(kvm, grp); + kvm_spapr_tce_release_iommu_group(kvm, kvg->iommu_group); + iommu_group_put(kvg->iommu_group); + kvg->iommu_group = NULL; } #endif @@ -252,19 +255,19 @@ static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, mutex_lock(&kv->lock); list_for_each_entry(kvg, &kv->group_list, node) { - struct iommu_group *grp; - if (kvg->file != f.file) continue; - grp = kvm_vfio_file_iommu_group(kvg->file); - if (WARN_ON_ONCE(!grp)) { - ret = -EIO; - goto err_fdput; + if (!kvg->iommu_group) { + kvg->iommu_group = kvm_vfio_file_iommu_group(kvg->file); + if (WARN_ON_ONCE(!kvg->iommu_group)) { + ret = -EIO; + goto err_fdput; + } } ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd, - grp); + kvg->iommu_group); break; } From 3dd59a7dcb97e6e40d6385a1a3faa9392b6d184a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 7 Oct 2022 11:04:41 -0300 Subject: [PATCH 76/77] vfio: Make the group FD disassociate from the iommu_group Allow the vfio_group struct to exist with a NULL iommu_group pointer. When the pointer is NULL the vfio_group users promise not to touch the iommu_group. This allows a driver to be hot unplugged while userspace is keeping the group FD open. Remove all the code waiting for the group FD to close. This fixes a userspace regression where we learned that virtnodedevd leaves a group FD open even though the /dev/ node for it has been deleted and all the drivers for it unplugged. Fixes: ca5f21b25749 ("vfio: Follow a strict lifetime for struct iommu_group") Reported-by: Christian Borntraeger Tested-by: Matthew Rosato Tested-by: Christian Borntraeger Tested-by: Eric Farman Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v2-15417f29324e+1c-vfio_group_disassociate_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.h | 1 - drivers/vfio/vfio_main.c | 69 ++++++++++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 4a1bac1359a9..bcad54bbab08 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -59,7 +59,6 @@ struct vfio_group { struct mutex group_lock; struct kvm *kvm; struct file *opened_file; - struct swait_queue_head opened_file_wait; struct blocking_notifier_head notifier; }; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 911ee1abdff0..04099a839a52 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -133,6 +133,10 @@ __vfio_group_get_from_iommu(struct iommu_group *iommu_group) { struct vfio_group *group; + /* + * group->iommu_group from the vfio.group_list cannot be NULL + * under the vfio.group_lock. + */ list_for_each_entry(group, &vfio.group_list, vfio_next) { if (group->iommu_group == iommu_group) { refcount_inc(&group->drivers); @@ -159,7 +163,7 @@ static void vfio_group_release(struct device *dev) mutex_destroy(&group->device_lock); mutex_destroy(&group->group_lock); - iommu_group_put(group->iommu_group); + WARN_ON(group->iommu_group); ida_free(&vfio.group_ida, MINOR(group->dev.devt)); kfree(group); } @@ -189,7 +193,6 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, refcount_set(&group->drivers, 1); mutex_init(&group->group_lock); - init_swait_queue_head(&group->opened_file_wait); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; @@ -248,6 +251,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, static void vfio_device_remove_group(struct vfio_device *device) { struct vfio_group *group = device->group; + struct iommu_group *iommu_group; if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); @@ -265,31 +269,29 @@ static void vfio_device_remove_group(struct vfio_device *device) */ cdev_device_del(&group->cdev, &group->dev); - /* - * Before we allow the last driver in the group to be unplugged the - * group must be sanitized so nothing else is or can reference it. This - * is because the group->iommu_group pointer should only be used so long - * as a device driver is attached to a device in the group. - */ - while (group->opened_file) { - mutex_unlock(&vfio.group_lock); - swait_event_idle_exclusive(group->opened_file_wait, - !group->opened_file); - mutex_lock(&vfio.group_lock); - } - mutex_unlock(&vfio.group_lock); - + mutex_lock(&group->group_lock); /* * These data structures all have paired operations that can only be - * undone when the caller holds a live reference on the group. Since all - * pairs must be undone these WARN_ON's indicate some caller did not + * undone when the caller holds a live reference on the device. Since + * all pairs must be undone these WARN_ON's indicate some caller did not * properly hold the group reference. */ WARN_ON(!list_empty(&group->device_list)); - WARN_ON(group->container || group->container_users); WARN_ON(group->notifier.head); - group->iommu_group = NULL; + /* + * Revoke all users of group->iommu_group. At this point we know there + * are no devices active because we are unplugging the last one. Setting + * iommu_group to NULL blocks all new users. + */ + if (group->container) + vfio_group_detach_container(group); + iommu_group = group->iommu_group; + group->iommu_group = NULL; + mutex_unlock(&group->group_lock); + mutex_unlock(&vfio.group_lock); + + iommu_group_put(iommu_group); put_device(&group->dev); } @@ -531,6 +533,10 @@ static int __vfio_register_dev(struct vfio_device *device, existing_device = vfio_group_get_device(group, device->dev); if (existing_device) { + /* + * group->iommu_group is non-NULL because we hold the drivers + * refcount. + */ dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(group->iommu_group)); vfio_device_put_registration(existing_device); @@ -702,6 +708,11 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, ret = -EINVAL; goto out_unlock; } + if (!group->iommu_group) { + ret = -ENODEV; + goto out_unlock; + } + container = vfio_container_from_file(f.file); ret = -EINVAL; if (container) { @@ -862,6 +873,11 @@ static int vfio_group_ioctl_get_status(struct vfio_group *group, status.flags = 0; mutex_lock(&group->group_lock); + if (!group->iommu_group) { + mutex_unlock(&group->group_lock); + return -ENODEV; + } + if (group->container) status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | VFIO_GROUP_FLAGS_VIABLE; @@ -947,8 +963,6 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) vfio_group_detach_container(group); group->opened_file = NULL; mutex_unlock(&group->group_lock); - swake_up_one(&group->opened_file_wait); - return 0; } @@ -1559,14 +1573,21 @@ static const struct file_operations vfio_device_fops = { struct iommu_group *vfio_file_iommu_group(struct file *file) { struct vfio_group *group = file->private_data; + struct iommu_group *iommu_group = NULL; if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) return NULL; if (!vfio_file_is_group(file)) return NULL; - iommu_group_ref_get(group->iommu_group); - return group->iommu_group; + + mutex_lock(&group->group_lock); + if (group->iommu_group) { + iommu_group = group->iommu_group; + iommu_group_ref_get(iommu_group); + } + mutex_unlock(&group->group_lock); + return iommu_group; } EXPORT_SYMBOL_GPL(vfio_file_iommu_group); From b1b8132a651cf6a5b18a01d8f1bd304f5d210315 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 7 Oct 2022 12:03:00 -0600 Subject: [PATCH 77/77] vfio: More vfio_file_is_group() use cases Replace further open coded tests with helper. Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/166516896843.1215571.5378890510536477434.stgit@omen Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 04099a839a52..2d168793d4e1 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1615,7 +1615,7 @@ bool vfio_file_enforced_coherent(struct file *file) struct vfio_group *group = file->private_data; bool ret; - if (file->f_op != &vfio_group_fops) + if (!vfio_file_is_group(file)) return true; mutex_lock(&group->group_lock); @@ -1647,7 +1647,7 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) { struct vfio_group *group = file->private_data; - if (file->f_op != &vfio_group_fops) + if (!vfio_file_is_group(file)) return; mutex_lock(&group->group_lock); @@ -1667,7 +1667,7 @@ bool vfio_file_has_dev(struct file *file, struct vfio_device *device) { struct vfio_group *group = file->private_data; - if (file->f_op != &vfio_group_fops) + if (!vfio_file_is_group(file)) return false; return group == device->group;