From fec2432c9a7370788faab416b38589ac9f4350e5 Mon Sep 17 00:00:00 2001 From: Diana Craciun Date: Wed, 22 Sep 2021 14:05:29 +0300 Subject: [PATCH 01/27] bus/fsl-mc: Add generic implementation for open/reset/close commands The open/reset/close commands format is similar for all objects. Currently there are multiple implementations for these commands scattered through various drivers. The code is cavsi-identical. Create a generic implementation for the open/reset/close commands. One of the consumer will be the VFIO driver which needs to be able to reset a device. Signed-off-by: Diana Craciun Reviewed-by: Laurentiu Tudor Link: https://lore.kernel.org/r/20210922110530.24736-1-diana.craciun@oss.nxp.com Signed-off-by: Alex Williamson --- drivers/bus/fsl-mc/Makefile | 3 +- drivers/bus/fsl-mc/fsl-mc-private.h | 39 +++++++++-- drivers/bus/fsl-mc/obj-api.c | 103 ++++++++++++++++++++++++++++ include/linux/fsl/mc.h | 14 ++++ 4 files changed, 154 insertions(+), 5 deletions(-) create mode 100644 drivers/bus/fsl-mc/obj-api.c diff --git a/drivers/bus/fsl-mc/Makefile b/drivers/bus/fsl-mc/Makefile index 4ae292a30e53..892946245527 100644 --- a/drivers/bus/fsl-mc/Makefile +++ b/drivers/bus/fsl-mc/Makefile @@ -15,7 +15,8 @@ mc-bus-driver-objs := fsl-mc-bus.o \ dprc-driver.o \ fsl-mc-allocator.o \ fsl-mc-msi.o \ - dpmcp.o + dpmcp.o \ + obj-api.o # MC userspace support obj-$(CONFIG_FSL_MC_UAPI_SUPPORT) += fsl-mc-uapi.o diff --git a/drivers/bus/fsl-mc/fsl-mc-private.h b/drivers/bus/fsl-mc/fsl-mc-private.h index 1958fa065360..b3520ea1b9f4 100644 --- a/drivers/bus/fsl-mc/fsl-mc-private.h +++ b/drivers/bus/fsl-mc/fsl-mc-private.h @@ -48,7 +48,6 @@ struct dpmng_rsp_get_version { /* DPMCP command IDs */ #define DPMCP_CMDID_CLOSE DPMCP_CMD(0x800) -#define DPMCP_CMDID_OPEN DPMCP_CMD(0x80b) #define DPMCP_CMDID_RESET DPMCP_CMD(0x005) struct dpmcp_cmd_open { @@ -91,7 +90,6 @@ int dpmcp_reset(struct fsl_mc_io *mc_io, /* DPRC command IDs */ #define DPRC_CMDID_CLOSE DPRC_CMD(0x800) -#define DPRC_CMDID_OPEN DPRC_CMD(0x805) #define DPRC_CMDID_GET_API_VERSION DPRC_CMD(0xa05) #define DPRC_CMDID_GET_ATTR DPRC_CMD(0x004) @@ -453,7 +451,6 @@ int dprc_get_connection(struct fsl_mc_io *mc_io, /* Command IDs */ #define DPBP_CMDID_CLOSE DPBP_CMD(0x800) -#define DPBP_CMDID_OPEN DPBP_CMD(0x804) #define DPBP_CMDID_ENABLE DPBP_CMD(0x002) #define DPBP_CMDID_DISABLE DPBP_CMD(0x003) @@ -492,7 +489,6 @@ struct dpbp_rsp_get_attributes { /* Command IDs */ #define DPCON_CMDID_CLOSE DPCON_CMD(0x800) -#define DPCON_CMDID_OPEN DPCON_CMD(0x808) #define DPCON_CMDID_ENABLE DPCON_CMD(0x002) #define DPCON_CMDID_DISABLE DPCON_CMD(0x003) @@ -524,6 +520,41 @@ struct dpcon_cmd_set_notification { __le64 user_ctx; }; +/* + * Generic FSL MC API + */ + +/* generic command versioning */ +#define OBJ_CMD_BASE_VERSION 1 +#define OBJ_CMD_ID_OFFSET 4 + +#define OBJ_CMD(id) (((id) << OBJ_CMD_ID_OFFSET) | OBJ_CMD_BASE_VERSION) + +/* open command codes */ +#define DPRTC_CMDID_OPEN OBJ_CMD(0x810) +#define DPNI_CMDID_OPEN OBJ_CMD(0x801) +#define DPSW_CMDID_OPEN OBJ_CMD(0x802) +#define DPIO_CMDID_OPEN OBJ_CMD(0x803) +#define DPBP_CMDID_OPEN OBJ_CMD(0x804) +#define DPRC_CMDID_OPEN OBJ_CMD(0x805) +#define DPDMUX_CMDID_OPEN OBJ_CMD(0x806) +#define DPCI_CMDID_OPEN OBJ_CMD(0x807) +#define DPCON_CMDID_OPEN OBJ_CMD(0x808) +#define DPSECI_CMDID_OPEN OBJ_CMD(0x809) +#define DPAIOP_CMDID_OPEN OBJ_CMD(0x80a) +#define DPMCP_CMDID_OPEN OBJ_CMD(0x80b) +#define DPMAC_CMDID_OPEN OBJ_CMD(0x80c) +#define DPDCEI_CMDID_OPEN OBJ_CMD(0x80d) +#define DPDMAI_CMDID_OPEN OBJ_CMD(0x80e) +#define DPDBG_CMDID_OPEN OBJ_CMD(0x80f) + +/* Generic object command IDs */ +#define OBJ_CMDID_CLOSE OBJ_CMD(0x800) +#define OBJ_CMDID_RESET OBJ_CMD(0x005) + +struct fsl_mc_obj_cmd_open { + __le32 obj_id; +}; /** * struct fsl_mc_resource_pool - Pool of MC resources of a given diff --git a/drivers/bus/fsl-mc/obj-api.c b/drivers/bus/fsl-mc/obj-api.c new file mode 100644 index 000000000000..06c1dd84e38d --- /dev/null +++ b/drivers/bus/fsl-mc/obj-api.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) +/* + * Copyright 2021 NXP + * + */ +#include +#include + +#include "fsl-mc-private.h" + +static int fsl_mc_get_open_cmd_id(const char *type) +{ + static const struct { + int cmd_id; + const char *type; + } dev_ids[] = { + { DPRTC_CMDID_OPEN, "dprtc" }, + { DPRC_CMDID_OPEN, "dprc" }, + { DPNI_CMDID_OPEN, "dpni" }, + { DPIO_CMDID_OPEN, "dpio" }, + { DPSW_CMDID_OPEN, "dpsw" }, + { DPBP_CMDID_OPEN, "dpbp" }, + { DPCON_CMDID_OPEN, "dpcon" }, + { DPMCP_CMDID_OPEN, "dpmcp" }, + { DPMAC_CMDID_OPEN, "dpmac" }, + { DPSECI_CMDID_OPEN, "dpseci" }, + { DPDMUX_CMDID_OPEN, "dpdmux" }, + { DPDCEI_CMDID_OPEN, "dpdcei" }, + { DPAIOP_CMDID_OPEN, "dpaiop" }, + { DPCI_CMDID_OPEN, "dpci" }, + { DPDMAI_CMDID_OPEN, "dpdmai" }, + { DPDBG_CMDID_OPEN, "dpdbg" }, + { 0, NULL } + }; + int i; + + for (i = 0; dev_ids[i].type; i++) + if (!strcmp(dev_ids[i].type, type)) + return dev_ids[i].cmd_id; + + return -1; +} + +int fsl_mc_obj_open(struct fsl_mc_io *mc_io, + u32 cmd_flags, + int obj_id, + char *obj_type, + u16 *token) +{ + struct fsl_mc_command cmd = { 0 }; + struct fsl_mc_obj_cmd_open *cmd_params; + int err = 0; + int cmd_id = fsl_mc_get_open_cmd_id(obj_type); + + if (cmd_id == -1) + return -ENODEV; + + /* prepare command */ + cmd.header = mc_encode_cmd_header(cmd_id, cmd_flags, 0); + cmd_params = (struct fsl_mc_obj_cmd_open *)cmd.params; + cmd_params->obj_id = cpu_to_le32(obj_id); + + /* send command to mc*/ + err = mc_send_command(mc_io, &cmd); + if (err) + return err; + + /* retrieve response parameters */ + *token = mc_cmd_hdr_read_token(&cmd); + + return err; +} +EXPORT_SYMBOL_GPL(fsl_mc_obj_open); + +int fsl_mc_obj_close(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token) +{ + struct fsl_mc_command cmd = { 0 }; + + /* prepare command */ + cmd.header = mc_encode_cmd_header(OBJ_CMDID_CLOSE, cmd_flags, + token); + + /* send command to mc*/ + return mc_send_command(mc_io, &cmd); +} +EXPORT_SYMBOL_GPL(fsl_mc_obj_close); + +int fsl_mc_obj_reset(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token) +{ + struct fsl_mc_command cmd = { 0 }; + + /* prepare command */ + cmd.header = mc_encode_cmd_header(OBJ_CMDID_RESET, cmd_flags, + token); + + /* send command to mc*/ + return mc_send_command(mc_io, &cmd); +} +EXPORT_SYMBOL_GPL(fsl_mc_obj_reset); diff --git a/include/linux/fsl/mc.h b/include/linux/fsl/mc.h index 30ece3ae6df7..e026f6c48b49 100644 --- a/include/linux/fsl/mc.h +++ b/include/linux/fsl/mc.h @@ -620,6 +620,20 @@ int dpcon_reset(struct fsl_mc_io *mc_io, u32 cmd_flags, u16 token); +int fsl_mc_obj_open(struct fsl_mc_io *mc_io, + u32 cmd_flags, + int obj_id, + char *obj_type, + u16 *token); + +int fsl_mc_obj_close(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token); + +int fsl_mc_obj_reset(struct fsl_mc_io *mc_io, + u32 cmd_flags, + u16 token); + /** * struct dpcon_attr - Structure representing DPCON attributes * @id: DPCON object ID From 8798a803ddf6329dc3b995775862b571db4909d2 Mon Sep 17 00:00:00 2001 From: Diana Craciun Date: Wed, 22 Sep 2021 14:05:30 +0300 Subject: [PATCH 02/27] vfio/fsl-mc: Add per device reset support Currently when a fsl-mc device is reset, the entire DPRC container is reset which is very inefficient because the devices within a container will be reset multiple times. Add support for individually resetting a device. Signed-off-by: Diana Craciun Reviewed-by: Laurentiu Tudor Link: https://lore.kernel.org/r/20210922110530.24736-2-diana.craciun@oss.nxp.com Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 45 ++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 0ead91bfa838..6d7b2d2571a2 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -65,6 +65,34 @@ static void vfio_fsl_mc_regions_cleanup(struct vfio_fsl_mc_device *vdev) kfree(vdev->regions); } +static int vfio_fsl_mc_reset_device(struct vfio_fsl_mc_device *vdev) +{ + struct fsl_mc_device *mc_dev = vdev->mc_dev; + int ret = 0; + + if (is_fsl_mc_bus_dprc(vdev->mc_dev)) { + return dprc_reset_container(mc_dev->mc_io, 0, + mc_dev->mc_handle, + mc_dev->obj_desc.id, + DPRC_RESET_OPTION_NON_RECURSIVE); + } else { + u16 token; + + ret = fsl_mc_obj_open(mc_dev->mc_io, 0, mc_dev->obj_desc.id, + mc_dev->obj_desc.type, + &token); + if (ret) + goto out; + ret = fsl_mc_obj_reset(mc_dev->mc_io, 0, token); + if (ret) { + fsl_mc_obj_close(mc_dev->mc_io, 0, token); + goto out; + } + ret = fsl_mc_obj_close(mc_dev->mc_io, 0, token); + } +out: + return ret; +} static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) { @@ -78,9 +106,7 @@ static void vfio_fsl_mc_close_device(struct vfio_device *core_vdev) vfio_fsl_mc_regions_cleanup(vdev); /* reset the device before cleaning up the interrupts */ - ret = dprc_reset_container(mc_cont->mc_io, 0, mc_cont->mc_handle, - mc_cont->obj_desc.id, - DPRC_RESET_OPTION_NON_RECURSIVE); + ret = vfio_fsl_mc_reset_device(vdev); if (WARN_ON(ret)) dev_warn(&mc_cont->dev, @@ -203,18 +229,7 @@ static long vfio_fsl_mc_ioctl(struct vfio_device *core_vdev, } case VFIO_DEVICE_RESET: { - int ret; - struct fsl_mc_device *mc_dev = vdev->mc_dev; - - /* reset is supported only for the DPRC */ - if (!is_fsl_mc_bus_dprc(mc_dev)) - return -ENOTTY; - - ret = dprc_reset_container(mc_dev->mc_io, 0, - mc_dev->mc_handle, - mc_dev->obj_desc.id, - DPRC_RESET_OPTION_NON_RECURSIVE); - return ret; + return vfio_fsl_mc_reset_device(vdev); } default: From 38a68934aa72459217986cf6b461d87f7602648a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 24 Sep 2021 17:56:51 +0200 Subject: [PATCH 03/27] vfio: Move vfio_iommu_group_get() to vfio_register_group_dev() We don't need to hold a reference to the group in the driver as well as obtain a reference to the same group as the first thing vfio_register_group_dev() does. Since the drivers never use the group move this all into the core code. Signed-off-by: Jason Gunthorpe Signed-off-by: Christoph Hellwig Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-2-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 17 ++------- drivers/vfio/pci/vfio_pci_core.c | 13 ++----- drivers/vfio/platform/vfio_platform_common.c | 13 +------ drivers/vfio/vfio.c | 36 ++++++++------------ include/linux/vfio.h | 3 -- 5 files changed, 19 insertions(+), 63 deletions(-) diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index 0ead91bfa838..9e838fed5603 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -505,22 +505,13 @@ static void vfio_fsl_uninit_device(struct vfio_fsl_mc_device *vdev) static int vfio_fsl_mc_probe(struct fsl_mc_device *mc_dev) { - struct iommu_group *group; struct vfio_fsl_mc_device *vdev; struct device *dev = &mc_dev->dev; int ret; - group = vfio_iommu_group_get(dev); - if (!group) { - dev_err(dev, "VFIO_FSL_MC: No IOMMU group\n"); - return -EINVAL; - } - vdev = kzalloc(sizeof(*vdev), GFP_KERNEL); - if (!vdev) { - ret = -ENOMEM; - goto out_group_put; - } + if (!vdev) + return -ENOMEM; vfio_init_group_dev(&vdev->vdev, dev, &vfio_fsl_mc_ops); vdev->mc_dev = mc_dev; @@ -556,8 +547,6 @@ out_device: out_uninit: vfio_uninit_group_dev(&vdev->vdev); kfree(vdev); -out_group_put: - vfio_iommu_group_put(group, dev); return ret; } @@ -574,8 +563,6 @@ static int vfio_fsl_mc_remove(struct fsl_mc_device *mc_dev) vfio_uninit_group_dev(&vdev->vdev); kfree(vdev); - vfio_iommu_group_put(mc_dev->dev.iommu_group, dev); - return 0; } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 68198e0f2a63..43bab0ca3e68 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1806,7 +1806,6 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_uninit_device); int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) { struct pci_dev *pdev = vdev->pdev; - struct iommu_group *group; int ret; if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) @@ -1825,10 +1824,6 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) return -EBUSY; } - group = vfio_iommu_group_get(&pdev->dev); - if (!group) - return -EINVAL; - if (pci_is_root_bus(pdev->bus)) { ret = vfio_assign_device_set(&vdev->vdev, vdev); } else if (!pci_probe_reset_slot(pdev->slot)) { @@ -1842,10 +1837,10 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) } if (ret) - goto out_group_put; + return ret; ret = vfio_pci_vf_init(vdev); if (ret) - goto out_group_put; + return ret; ret = vfio_pci_vga_init(vdev); if (ret) goto out_vf; @@ -1876,8 +1871,6 @@ out_power: vfio_pci_set_power_state(vdev, PCI_D0); out_vf: vfio_pci_vf_uninit(vdev); -out_group_put: - vfio_iommu_group_put(group, &pdev->dev); return ret; } EXPORT_SYMBOL_GPL(vfio_pci_core_register_device); @@ -1893,8 +1886,6 @@ void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev) vfio_pci_vf_uninit(vdev); vfio_pci_vga_uninit(vdev); - vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); - if (!disable_idle_d3) vfio_pci_set_power_state(vdev, PCI_D0); } diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 6af7ce7d619c..256f55b84e70 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -642,7 +642,6 @@ static int vfio_platform_of_probe(struct vfio_platform_device *vdev, int vfio_platform_probe_common(struct vfio_platform_device *vdev, struct device *dev) { - struct iommu_group *group; int ret; vfio_init_group_dev(&vdev->vdev, dev, &vfio_platform_ops); @@ -663,24 +662,15 @@ int vfio_platform_probe_common(struct vfio_platform_device *vdev, goto out_uninit; } - group = vfio_iommu_group_get(dev); - if (!group) { - dev_err(dev, "No IOMMU group for device %s\n", vdev->name); - ret = -EINVAL; - goto put_reset; - } - ret = vfio_register_group_dev(&vdev->vdev); if (ret) - goto put_iommu; + goto put_reset; mutex_init(&vdev->igate); pm_runtime_enable(dev); return 0; -put_iommu: - vfio_iommu_group_put(group, dev); put_reset: vfio_platform_put_reset(vdev); out_uninit: @@ -696,7 +686,6 @@ void vfio_platform_remove_common(struct vfio_platform_device *vdev) pm_runtime_disable(vdev->device); vfio_platform_put_reset(vdev); vfio_uninit_group_dev(&vdev->vdev); - vfio_iommu_group_put(vdev->vdev.dev->iommu_group, vdev->vdev.dev); } EXPORT_SYMBOL_GPL(vfio_platform_remove_common); diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 3c034fe14ccb..b483b61b7c22 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -169,15 +169,7 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } -/* - * vfio_iommu_group_{get,put} are only intended for VFIO bus driver probe - * and remove functions, any use cases other than acquiring the first - * reference for the purpose of calling vfio_register_group_dev() or removing - * that symmetric reference after vfio_unregister_group_dev() should use the raw - * iommu_group_{get,put} functions. In particular, vfio_iommu_group_put() - * removes the device from the dummy group and cannot be nested. - */ -struct iommu_group *vfio_iommu_group_get(struct device *dev) +static struct iommu_group *vfio_iommu_group_get(struct device *dev) { struct iommu_group *group; int __maybe_unused ret; @@ -220,18 +212,6 @@ struct iommu_group *vfio_iommu_group_get(struct device *dev) return group; } -EXPORT_SYMBOL_GPL(vfio_iommu_group_get); - -void vfio_iommu_group_put(struct iommu_group *group, struct device *dev) -{ -#ifdef CONFIG_VFIO_NOIOMMU - if (iommu_group_get_iommudata(group) == &noiommu) - iommu_group_remove_device(dev); -#endif - - iommu_group_put(group); -} -EXPORT_SYMBOL_GPL(vfio_iommu_group_put); #ifdef CONFIG_VFIO_NOIOMMU static void *vfio_noiommu_open(unsigned long arg) @@ -841,7 +821,7 @@ int vfio_register_group_dev(struct vfio_device *device) if (!device->dev_set) vfio_assign_device_set(device, device); - iommu_group = iommu_group_get(device->dev); + iommu_group = vfio_iommu_group_get(device->dev); if (!iommu_group) return -EINVAL; @@ -849,6 +829,10 @@ int vfio_register_group_dev(struct vfio_device *device) if (!group) { group = vfio_create_group(iommu_group); if (IS_ERR(group)) { +#ifdef CONFIG_VFIO_NOIOMMU + if (iommu_group_get_iommudata(iommu_group) == &noiommu) + iommu_group_remove_device(device->dev); +#endif iommu_group_put(iommu_group); return PTR_ERR(group); } @@ -865,6 +849,10 @@ int vfio_register_group_dev(struct vfio_device *device) dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(iommu_group)); vfio_device_put(existing_device); +#ifdef CONFIG_VFIO_NOIOMMU + if (iommu_group_get_iommudata(iommu_group) == &noiommu) + iommu_group_remove_device(device->dev); +#endif vfio_group_put(group); return -EBUSY; } @@ -1010,6 +998,10 @@ void vfio_unregister_group_dev(struct vfio_device *device) if (list_empty(&group->device_list)) wait_event(group->container_q, !group->container); +#ifdef CONFIG_VFIO_NOIOMMU + if (iommu_group_get_iommudata(group->iommu_group) == &noiommu) + iommu_group_remove_device(device->dev); +#endif /* Matches the get in vfio_register_group_dev() */ vfio_group_put(group); } diff --git a/include/linux/vfio.h b/include/linux/vfio.h index b53a9557884a..f7083c2fd0d0 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -71,9 +71,6 @@ struct vfio_device_ops { int (*match)(struct vfio_device *vdev, char *buf); }; -extern struct iommu_group *vfio_iommu_group_get(struct device *dev); -extern void vfio_iommu_group_put(struct iommu_group *group, struct device *dev); - void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_uninit_group_dev(struct vfio_device *device); From b00621603d050f77a6af9e81e32daeccfd246d6a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:52 +0200 Subject: [PATCH 04/27] vfio: factor out a vfio_iommu_driver_allowed helper Factor out a little helper to make the checks for the noiommu driver less ugly. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-3-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index b483b61b7c22..faf8e0d637bb 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -257,8 +257,23 @@ static const struct vfio_iommu_driver_ops vfio_noiommu_ops = { .attach_group = vfio_noiommu_attach_group, .detach_group = vfio_noiommu_detach_group, }; -#endif +/* + * Only noiommu containers can use vfio-noiommu and noiommu containers can only + * use vfio-noiommu. + */ +static inline bool vfio_iommu_driver_allowed(struct vfio_container *container, + const struct vfio_iommu_driver *driver) +{ + return container->noiommu == (driver->ops == &vfio_noiommu_ops); +} +#else +static inline bool vfio_iommu_driver_allowed(struct vfio_container *container, + const struct vfio_iommu_driver *driver) +{ + return true; +} +#endif /* CONFIG_VFIO_NOIOMMU */ /** * IOMMU driver registration @@ -1034,13 +1049,10 @@ static long vfio_ioctl_check_extension(struct vfio_container *container, list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { -#ifdef CONFIG_VFIO_NOIOMMU if (!list_empty(&container->group_list) && - (container->noiommu != - (driver->ops == &vfio_noiommu_ops))) + !vfio_iommu_driver_allowed(container, + driver)) continue; -#endif - if (!try_module_get(driver->ops->owner)) continue; @@ -1112,15 +1124,8 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container, list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) { void *data; -#ifdef CONFIG_VFIO_NOIOMMU - /* - * Only noiommu containers can use vfio-noiommu and noiommu - * containers can only use vfio-noiommu. - */ - if (container->noiommu != (driver->ops == &vfio_noiommu_ops)) + if (!vfio_iommu_driver_allowed(container, driver)) continue; -#endif - if (!try_module_get(driver->ops->owner)) continue; From c5b4ba9730e6d607d11c3c3c9b2f04852e121423 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:53 +0200 Subject: [PATCH 05/27] vfio: remove the iommudata check in vfio_noiommu_attach_group vfio_noiommu_attach_group has two callers: 1) __vfio_container_attach_groups is called by vfio_ioctl_set_iommu, which just called vfio_iommu_driver_allowed 2) vfio_group_set_container requires already checks ->noiommu on the vfio_group, which is propagated from the iommudata in vfio_create_group so this check is entirely superflous and can be removed. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-4-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index faf8e0d637bb..8bd4b0b96b94 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -240,7 +240,7 @@ static long vfio_noiommu_ioctl(void *iommu_data, static int vfio_noiommu_attach_group(void *iommu_data, struct iommu_group *iommu_group) { - return iommu_group_get_iommudata(iommu_group) == &noiommu ? 0 : -EINVAL; + return 0; } static void vfio_noiommu_detach_group(void *iommu_data, From 1362591f15233bdc3d1fbf27e266068bd16a4e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:54 +0200 Subject: [PATCH 06/27] vfio: factor out a vfio_group_find_or_alloc helper Factor out a helper to find or allocate the vfio_group to reduce the spagetthi code in vfio_register_group_dev a little. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-5-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 60 ++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 25 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 8bd4b0b96b94..2b2679c7126f 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -823,10 +823,39 @@ void vfio_uninit_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); +struct vfio_group *vfio_group_find_or_alloc(struct device *dev) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + + iommu_group = vfio_iommu_group_get(dev); + if (!iommu_group) + return ERR_PTR(-EINVAL); + + /* a found vfio_group already holds a reference to the iommu_group */ + group = vfio_group_get_from_iommu(iommu_group); + if (group) + goto out_put; + + /* a newly created vfio_group keeps the reference. */ + group = vfio_create_group(iommu_group); + if (IS_ERR(group)) + goto out_remove; + return group; + +out_remove: +#ifdef CONFIG_VFIO_NOIOMMU + if (iommu_group_get_iommudata(iommu_group) == &noiommu) + iommu_group_remove_device(dev); +#endif +out_put: + iommu_group_put(iommu_group); + return group; +} + int vfio_register_group_dev(struct vfio_device *device) { struct vfio_device *existing_device; - struct iommu_group *iommu_group; struct vfio_group *group; /* @@ -836,36 +865,17 @@ int vfio_register_group_dev(struct vfio_device *device) if (!device->dev_set) vfio_assign_device_set(device, device); - iommu_group = vfio_iommu_group_get(device->dev); - if (!iommu_group) - return -EINVAL; - - group = vfio_group_get_from_iommu(iommu_group); - if (!group) { - group = vfio_create_group(iommu_group); - if (IS_ERR(group)) { -#ifdef CONFIG_VFIO_NOIOMMU - if (iommu_group_get_iommudata(iommu_group) == &noiommu) - iommu_group_remove_device(device->dev); -#endif - iommu_group_put(iommu_group); - return PTR_ERR(group); - } - } else { - /* - * A found vfio_group already holds a reference to the - * iommu_group. A created vfio_group keeps the reference. - */ - iommu_group_put(iommu_group); - } + group = vfio_group_find_or_alloc(device->dev); + if (IS_ERR(group)) + return PTR_ERR(group); existing_device = vfio_group_get_device(group, device->dev); if (existing_device) { dev_WARN(device->dev, "Device already exists on group %d\n", - iommu_group_id(iommu_group)); + iommu_group_id(group->iommu_group)); vfio_device_put(existing_device); #ifdef CONFIG_VFIO_NOIOMMU - if (iommu_group_get_iommudata(iommu_group) == &noiommu) + if (iommu_group_get_iommudata(group->iommu_group) == &noiommu) iommu_group_remove_device(device->dev); #endif vfio_group_put(group); From 3af917713230c8a6c3431e60601f0c7ad72eceb6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:55 +0200 Subject: [PATCH 07/27] vfio: refactor noiommu group creation Split the actual noiommu group creation from vfio_iommu_group_get into a new helper, and open code the rest of vfio_iommu_group_get in its only caller. This creates an entirely separate and clear code path for the noiommu group creation. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-6-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 104 ++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 2b2679c7126f..b1ed156adccd 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -169,50 +169,6 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } -static struct iommu_group *vfio_iommu_group_get(struct device *dev) -{ - struct iommu_group *group; - int __maybe_unused ret; - - group = iommu_group_get(dev); - -#ifdef CONFIG_VFIO_NOIOMMU - /* - * With noiommu enabled, an IOMMU group will be created for a device - * that doesn't already have one and doesn't have an iommu_ops on their - * bus. We set iommudata simply to be able to identify these groups - * as special use and for reclamation later. - */ - if (group || !noiommu || iommu_present(dev->bus)) - return group; - - group = iommu_group_alloc(); - if (IS_ERR(group)) - return NULL; - - iommu_group_set_name(group, "vfio-noiommu"); - iommu_group_set_iommudata(group, &noiommu, NULL); - ret = iommu_group_add_device(group, dev); - if (ret) { - iommu_group_put(group); - return NULL; - } - - /* - * Where to taint? At this point we've added an IOMMU group for a - * device that is not backed by iommu_ops, therefore any iommu_ - * callback using iommu_ops can legitimately Oops. So, while we may - * be about to give a DMA capable device to a user without IOMMU - * protection, which is clearly taint-worthy, let's go ahead and do - * it here. - */ - add_taint(TAINT_USER, LOCKDEP_STILL_OK); - dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); -#endif - - return group; -} - #ifdef CONFIG_VFIO_NOIOMMU static void *vfio_noiommu_open(unsigned long arg) { @@ -823,12 +779,61 @@ void vfio_uninit_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); -struct vfio_group *vfio_group_find_or_alloc(struct device *dev) +#ifdef CONFIG_VFIO_NOIOMMU +static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + int ret; + + iommu_group = iommu_group_alloc(); + if (IS_ERR(iommu_group)) + return ERR_CAST(iommu_group); + + iommu_group_set_name(iommu_group, "vfio-noiommu"); + iommu_group_set_iommudata(iommu_group, &noiommu, NULL); + ret = iommu_group_add_device(iommu_group, dev); + if (ret) + goto out_put_group; + + group = vfio_create_group(iommu_group); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + goto out_remove_device; + } + + return group; + +out_remove_device: + iommu_group_remove_device(dev); +out_put_group: + iommu_group_put(iommu_group); + return ERR_PTR(ret); +} +#endif + +static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) { struct iommu_group *iommu_group; struct vfio_group *group; - iommu_group = vfio_iommu_group_get(dev); + iommu_group = iommu_group_get(dev); +#ifdef CONFIG_VFIO_NOIOMMU + if (!iommu_group && noiommu && !iommu_present(dev->bus)) { + /* + * With noiommu enabled, create an IOMMU group for devices that + * don't already have one and don't have an iommu_ops on their + * bus. Taint the kernel because we're about to give a DMA + * capable device to a user without IOMMU protection. + */ + group = vfio_noiommu_group_alloc(dev); + if (!IS_ERR(group)) { + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); + } + return group; + } +#endif if (!iommu_group) return ERR_PTR(-EINVAL); @@ -840,14 +845,9 @@ struct vfio_group *vfio_group_find_or_alloc(struct device *dev) /* a newly created vfio_group keeps the reference. */ group = vfio_create_group(iommu_group); if (IS_ERR(group)) - goto out_remove; + goto out_put; return group; -out_remove: -#ifdef CONFIG_VFIO_NOIOMMU - if (iommu_group_get_iommudata(iommu_group) == &noiommu) - iommu_group_remove_device(dev); -#endif out_put: iommu_group_put(iommu_group); return group; From c04ac34078a4d7c08e1218b548e0a6b197f01582 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:56 +0200 Subject: [PATCH 08/27] vfio: remove the iommudata hack for noiommu groups Just pass a noiommu argument to vfio_create_group and set up the ->noiommu flag directly. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-7-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index b1ed156adccd..23eaebd2e28c 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -335,7 +335,8 @@ static void vfio_group_unlock_and_free(struct vfio_group *group) /** * Group objects - create, release, get, put, search */ -static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, + bool noiommu) { struct vfio_group *group, *tmp; struct device *dev; @@ -354,9 +355,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) atomic_set(&group->opened, 0); init_waitqueue_head(&group->container_q); group->iommu_group = iommu_group; -#ifdef CONFIG_VFIO_NOIOMMU - group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu); -#endif + group->noiommu = noiommu; BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); group->nb.notifier_call = vfio_iommu_group_notifier; @@ -791,12 +790,11 @@ static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev) return ERR_CAST(iommu_group); iommu_group_set_name(iommu_group, "vfio-noiommu"); - iommu_group_set_iommudata(iommu_group, &noiommu, NULL); ret = iommu_group_add_device(iommu_group, dev); if (ret) goto out_put_group; - group = vfio_create_group(iommu_group); + group = vfio_create_group(iommu_group, true); if (IS_ERR(group)) { ret = PTR_ERR(group); goto out_remove_device; @@ -843,7 +841,7 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) goto out_put; /* a newly created vfio_group keeps the reference. */ - group = vfio_create_group(iommu_group); + group = vfio_create_group(iommu_group, false); if (IS_ERR(group)) goto out_put; return group; @@ -874,10 +872,8 @@ int vfio_register_group_dev(struct vfio_device *device) dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(group->iommu_group)); vfio_device_put(existing_device); -#ifdef CONFIG_VFIO_NOIOMMU - if (iommu_group_get_iommudata(group->iommu_group) == &noiommu) + if (group->noiommu) iommu_group_remove_device(device->dev); -#endif vfio_group_put(group); return -EBUSY; } @@ -1023,10 +1019,9 @@ void vfio_unregister_group_dev(struct vfio_device *device) if (list_empty(&group->device_list)) wait_event(group->container_q, !group->container); -#ifdef CONFIG_VFIO_NOIOMMU - if (iommu_group_get_iommudata(group->iommu_group) == &noiommu) + if (group->noiommu) iommu_group_remove_device(device->dev); -#endif + /* Matches the get in vfio_register_group_dev() */ vfio_group_put(group); } From c68ea0d00ad82428154aed890ec9f793e460fa1c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:57 +0200 Subject: [PATCH 09/27] vfio: simplify iommu group allocation for mediated devices Reuse the logic in vfio_noiommu_group_alloc to allocate a fake single-device iommu group for mediated devices by factoring out a common function, and replacing the noiommu boolean field in struct vfio_group with an enum to distinguish the three different kinds of groups. Signed-off-by: Christoph Hellwig Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-8-hch@lst.de Signed-off-by: Alex Williamson --- drivers/s390/crypto/vfio_ap_ops.c | 2 +- drivers/vfio/mdev/mdev_driver.c | 45 ++------------- drivers/vfio/mdev/vfio_mdev.c | 2 +- drivers/vfio/vfio.c | 92 ++++++++++++++++++++++--------- include/linux/vfio.h | 1 + samples/vfio-mdev/mbochs.c | 2 +- samples/vfio-mdev/mdpy.c | 2 +- samples/vfio-mdev/mtty.c | 2 +- 8 files changed, 76 insertions(+), 72 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 118939a7729a..24755d1aedd5 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -351,7 +351,7 @@ static int vfio_ap_mdev_probe(struct mdev_device *mdev) list_add(&matrix_mdev->node, &matrix_dev->mdev_list); mutex_unlock(&matrix_dev->lock); - ret = vfio_register_group_dev(&matrix_mdev->vdev); + ret = vfio_register_emulated_iommu_dev(&matrix_mdev->vdev); if (ret) goto err_list; dev_set_drvdata(&mdev->dev, matrix_mdev); diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c index e2cb1ff56f6c..7927ed4f1711 100644 --- a/drivers/vfio/mdev/mdev_driver.c +++ b/drivers/vfio/mdev/mdev_driver.c @@ -13,60 +13,23 @@ #include "mdev_private.h" -static int mdev_attach_iommu(struct mdev_device *mdev) -{ - int ret; - struct iommu_group *group; - - group = iommu_group_alloc(); - if (IS_ERR(group)) - return PTR_ERR(group); - - ret = iommu_group_add_device(group, &mdev->dev); - if (!ret) - dev_info(&mdev->dev, "MDEV: group_id = %d\n", - iommu_group_id(group)); - - iommu_group_put(group); - return ret; -} - -static void mdev_detach_iommu(struct mdev_device *mdev) -{ - iommu_group_remove_device(&mdev->dev); - dev_info(&mdev->dev, "MDEV: detaching iommu\n"); -} - static int mdev_probe(struct device *dev) { struct mdev_driver *drv = container_of(dev->driver, struct mdev_driver, driver); - struct mdev_device *mdev = to_mdev_device(dev); - int ret; - ret = mdev_attach_iommu(mdev); - if (ret) - return ret; - - if (drv->probe) { - ret = drv->probe(mdev); - if (ret) - mdev_detach_iommu(mdev); - } - - return ret; + if (!drv->probe) + return 0; + return drv->probe(to_mdev_device(dev)); } static void mdev_remove(struct device *dev) { struct mdev_driver *drv = container_of(dev->driver, struct mdev_driver, driver); - struct mdev_device *mdev = to_mdev_device(dev); if (drv->remove) - drv->remove(mdev); - - mdev_detach_iommu(mdev); + drv->remove(to_mdev_device(dev)); } static int mdev_match(struct device *dev, struct device_driver *drv) diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index 7a9883048216..a90e24b0c851 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -119,7 +119,7 @@ static int vfio_mdev_probe(struct mdev_device *mdev) return -ENOMEM; vfio_init_group_dev(vdev, &mdev->dev, &vfio_mdev_dev_ops); - ret = vfio_register_group_dev(vdev); + ret = vfio_register_emulated_iommu_dev(vdev); if (ret) goto out_uninit; diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 23eaebd2e28c..2508c8c39840 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -67,6 +67,30 @@ struct vfio_unbound_dev { struct list_head unbound_next; }; +enum vfio_group_type { + /* + * Physical device with IOMMU backing. + */ + VFIO_IOMMU, + + /* + * Virtual device without IOMMU backing. The VFIO core fakes up an + * iommu_group as the iommu_group sysfs interface is part of the + * userspace ABI. The user of these devices must not be able to + * directly trigger unmediated DMA. + */ + VFIO_EMULATED_IOMMU, + + /* + * Physical device without IOMMU backing. The VFIO core fakes up an + * iommu_group as the iommu_group sysfs interface is part of the + * userspace ABI. Users can trigger unmediated DMA by the device, + * usage is highly dangerous, requires an explicit opt-in and will + * taint the kernel. + */ + VFIO_NO_IOMMU, +}; + struct vfio_group { struct kref kref; int minor; @@ -83,7 +107,7 @@ struct vfio_group { struct mutex unbound_lock; atomic_t opened; wait_queue_head_t container_q; - bool noiommu; + enum vfio_group_type type; unsigned int dev_counter; struct kvm *kvm; struct blocking_notifier_head notifier; @@ -336,7 +360,7 @@ static void vfio_group_unlock_and_free(struct vfio_group *group) * Group objects - create, release, get, put, search */ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, - bool noiommu) + enum vfio_group_type type) { struct vfio_group *group, *tmp; struct device *dev; @@ -355,7 +379,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, atomic_set(&group->opened, 0); init_waitqueue_head(&group->container_q); group->iommu_group = iommu_group; - group->noiommu = noiommu; + group->type = type; BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); group->nb.notifier_call = vfio_iommu_group_notifier; @@ -391,8 +415,8 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, } dev = device_create(vfio.class, NULL, - MKDEV(MAJOR(vfio.group_devt), minor), - group, "%s%d", group->noiommu ? "noiommu-" : "", + MKDEV(MAJOR(vfio.group_devt), minor), group, "%s%d", + group->type == VFIO_NO_IOMMU ? "noiommu-" : "", iommu_group_id(iommu_group)); if (IS_ERR(dev)) { vfio_free_group_minor(minor); @@ -778,8 +802,8 @@ void vfio_uninit_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_uninit_group_dev); -#ifdef CONFIG_VFIO_NOIOMMU -static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev) +static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, + enum vfio_group_type type) { struct iommu_group *iommu_group; struct vfio_group *group; @@ -794,7 +818,7 @@ static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev) if (ret) goto out_put_group; - group = vfio_create_group(iommu_group, true); + group = vfio_create_group(iommu_group, type); if (IS_ERR(group)) { ret = PTR_ERR(group); goto out_remove_device; @@ -808,7 +832,6 @@ out_put_group: iommu_group_put(iommu_group); return ERR_PTR(ret); } -#endif static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) { @@ -824,7 +847,7 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) * bus. Taint the kernel because we're about to give a DMA * capable device to a user without IOMMU protection. */ - group = vfio_noiommu_group_alloc(dev); + group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); if (!IS_ERR(group)) { add_taint(TAINT_USER, LOCKDEP_STILL_OK); dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); @@ -841,7 +864,7 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) goto out_put; /* a newly created vfio_group keeps the reference. */ - group = vfio_create_group(iommu_group, false); + group = vfio_create_group(iommu_group, VFIO_IOMMU); if (IS_ERR(group)) goto out_put; return group; @@ -851,10 +874,13 @@ out_put: return group; } -int vfio_register_group_dev(struct vfio_device *device) +static int __vfio_register_dev(struct vfio_device *device, + struct vfio_group *group) { struct vfio_device *existing_device; - struct vfio_group *group; + + if (IS_ERR(group)) + return PTR_ERR(group); /* * If the driver doesn't specify a set then the device is added to a @@ -863,16 +889,13 @@ int vfio_register_group_dev(struct vfio_device *device) if (!device->dev_set) vfio_assign_device_set(device, device); - group = vfio_group_find_or_alloc(device->dev); - if (IS_ERR(group)) - return PTR_ERR(group); - existing_device = vfio_group_get_device(group, device->dev); if (existing_device) { dev_WARN(device->dev, "Device already exists on group %d\n", iommu_group_id(group->iommu_group)); vfio_device_put(existing_device); - if (group->noiommu) + if (group->type == VFIO_NO_IOMMU || + group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); vfio_group_put(group); return -EBUSY; @@ -891,8 +914,25 @@ int vfio_register_group_dev(struct vfio_device *device) return 0; } + +int vfio_register_group_dev(struct vfio_device *device) +{ + return __vfio_register_dev(device, + vfio_group_find_or_alloc(device->dev)); +} EXPORT_SYMBOL_GPL(vfio_register_group_dev); +/* + * Register a virtual device without IOMMU backing. The user of this + * device must not be able to directly trigger unmediated DMA. + */ +int vfio_register_emulated_iommu_dev(struct vfio_device *device) +{ + return __vfio_register_dev(device, + vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU)); +} +EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); + /** * Get a reference to the vfio_device for a device. Even if the * caller thinks they own the device, they could be racing with a @@ -1019,7 +1059,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) if (list_empty(&group->device_list)) wait_event(group->container_q, !group->container); - if (group->noiommu) + if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) iommu_group_remove_device(device->dev); /* Matches the get in vfio_register_group_dev() */ @@ -1368,7 +1408,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) if (atomic_read(&group->container_users)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; f = fdget(container_fd); @@ -1388,7 +1428,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) /* Real groups and fake groups cannot mix */ if (!list_empty(&container->group_list) && - container->noiommu != group->noiommu) { + container->noiommu != (group->type == VFIO_NO_IOMMU)) { ret = -EPERM; goto unlock_out; } @@ -1402,7 +1442,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) } group->container = container; - container->noiommu = group->noiommu; + container->noiommu = (group->type == VFIO_NO_IOMMU); list_add(&group->container_next, &container->group_list); /* Get a reference on the container and mark a user within the group */ @@ -1426,7 +1466,7 @@ static int vfio_group_add_container_user(struct vfio_group *group) if (!atomic_inc_not_zero(&group->container_users)) return -EINVAL; - if (group->noiommu) { + if (group->type == VFIO_NO_IOMMU) { atomic_dec(&group->container_users); return -EPERM; } @@ -1451,7 +1491,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) !group->container->iommu_driver || !vfio_group_viable(group)) return -EINVAL; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) return -EPERM; device = vfio_device_get_from_name(group, buf); @@ -1498,7 +1538,7 @@ static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) fd_install(fdno, filep); - if (group->noiommu) + if (group->type == VFIO_NO_IOMMU) dev_warn(device->dev, "vfio-noiommu device opened by user " "(%s:%d)\n", current->comm, task_pid_nr(current)); return fdno; @@ -1594,7 +1634,7 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) if (!group) return -ENODEV; - if (group->noiommu && !capable(CAP_SYS_RAWIO)) { + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { vfio_group_put(group); return -EPERM; } diff --git a/include/linux/vfio.h b/include/linux/vfio.h index f7083c2fd0d0..bbe293008626 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -75,6 +75,7 @@ void vfio_init_group_dev(struct vfio_device *device, struct device *dev, const struct vfio_device_ops *ops); void vfio_uninit_group_dev(struct vfio_device *device); int vfio_register_group_dev(struct vfio_device *device); +int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); extern void vfio_device_put(struct vfio_device *device); diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index c313ab4d1f4e..cd41bec5fdeb 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -553,7 +553,7 @@ static int mbochs_probe(struct mdev_device *mdev) mbochs_create_config_space(mdev_state); mbochs_reset(mdev_state); - ret = vfio_register_group_dev(&mdev_state->vdev); + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); if (ret) goto err_mem; dev_set_drvdata(&mdev->dev, mdev_state); diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index 8d1a80a0722a..fe5d43e797b6 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -258,7 +258,7 @@ static int mdpy_probe(struct mdev_device *mdev) mdpy_count++; - ret = vfio_register_group_dev(&mdev_state->vdev); + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); if (ret) goto err_mem; dev_set_drvdata(&mdev->dev, mdev_state); diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index 5983cdb16e3d..a0e1a469bd47 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -741,7 +741,7 @@ static int mtty_probe(struct mdev_device *mdev) mtty_create_config_space(mdev_state); - ret = vfio_register_group_dev(&mdev_state->vdev); + ret = vfio_register_emulated_iommu_dev(&mdev_state->vdev); if (ret) goto err_vconfig; dev_set_drvdata(&mdev->dev, mdev_state); From 67462037872d5ca57dc4674cccff191947b9b43e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:58 +0200 Subject: [PATCH 10/27] vfio: remove unused method from vfio_iommu_driver_ops The read, write and mmap methods are never implemented, so remove them. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-9-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 50 -------------------------------------------- include/linux/vfio.h | 5 ----- 2 files changed, 55 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 2508c8c39840..2c1c7316aa19 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1276,62 +1276,12 @@ static int vfio_fops_release(struct inode *inode, struct file *filep) return 0; } -/* - * Once an iommu driver is set, we optionally pass read/write/mmap - * on to the driver, allowing management interfaces beyond ioctl. - */ -static ssize_t vfio_fops_read(struct file *filep, char __user *buf, - size_t count, loff_t *ppos) -{ - struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver; - ssize_t ret = -EINVAL; - - driver = container->iommu_driver; - if (likely(driver && driver->ops->read)) - ret = driver->ops->read(container->iommu_data, - buf, count, ppos); - - return ret; -} - -static ssize_t vfio_fops_write(struct file *filep, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver; - ssize_t ret = -EINVAL; - - driver = container->iommu_driver; - if (likely(driver && driver->ops->write)) - ret = driver->ops->write(container->iommu_data, - buf, count, ppos); - - return ret; -} - -static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma) -{ - struct vfio_container *container = filep->private_data; - struct vfio_iommu_driver *driver; - int ret = -EINVAL; - - driver = container->iommu_driver; - if (likely(driver && driver->ops->mmap)) - ret = driver->ops->mmap(container->iommu_data, vma); - - return ret; -} - static const struct file_operations vfio_fops = { .owner = THIS_MODULE, .open = vfio_fops_open, .release = vfio_fops_release, - .read = vfio_fops_read, - .write = vfio_fops_write, .unlocked_ioctl = vfio_fops_unl_ioctl, .compat_ioctl = compat_ptr_ioctl, - .mmap = vfio_fops_mmap, }; /** diff --git a/include/linux/vfio.h b/include/linux/vfio.h index bbe293008626..7a57a0077f96 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -95,13 +95,8 @@ struct vfio_iommu_driver_ops { struct module *owner; void *(*open)(unsigned long arg); void (*release)(void *iommu_data); - ssize_t (*read)(void *iommu_data, char __user *buf, - size_t count, loff_t *ppos); - ssize_t (*write)(void *iommu_data, const char __user *buf, - size_t count, loff_t *size); long (*ioctl)(void *iommu_data, unsigned int cmd, unsigned long arg); - int (*mmap)(void *iommu_data, struct vm_area_struct *vma); int (*attach_group)(void *iommu_data, struct iommu_group *group); void (*detach_group)(void *iommu_data, From 8cc02d22d7e1596ed687c4ff967c32056c2bef3e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:56:59 +0200 Subject: [PATCH 11/27] vfio: move the vfio_iommu_driver_ops interface out of Create a new private drivers/vfio/vfio.h header for the interface between the VFIO core and the iommu drivers. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-10-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 1 + drivers/vfio/vfio.h | 47 +++++++++++++++++++++++++++++ drivers/vfio/vfio_iommu_spapr_tce.c | 1 + drivers/vfio/vfio_iommu_type1.c | 1 + include/linux/vfio.h | 44 --------------------------- 5 files changed, 50 insertions(+), 44 deletions(-) create mode 100644 drivers/vfio/vfio.h diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 2c1c7316aa19..6589e296ef34 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -32,6 +32,7 @@ #include #include #include +#include "vfio.h" #define DRIVER_VERSION "0.3" #define DRIVER_AUTHOR "Alex Williamson " diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h new file mode 100644 index 000000000000..a78de649eb2f --- /dev/null +++ b/drivers/vfio/vfio.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson + */ + +/* events for the backend driver notify callback */ +enum vfio_iommu_notify_type { + VFIO_IOMMU_CONTAINER_CLOSE = 0, +}; + +/** + * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks + */ +struct vfio_iommu_driver_ops { + char *name; + struct module *owner; + void *(*open)(unsigned long arg); + void (*release)(void *iommu_data); + long (*ioctl)(void *iommu_data, unsigned int cmd, + unsigned long arg); + int (*attach_group)(void *iommu_data, + struct iommu_group *group); + void (*detach_group)(void *iommu_data, + struct iommu_group *group); + int (*pin_pages)(void *iommu_data, + struct iommu_group *group, + unsigned long *user_pfn, + int npage, int prot, + unsigned long *phys_pfn); + int (*unpin_pages)(void *iommu_data, + unsigned long *user_pfn, int npage); + int (*register_notifier)(void *iommu_data, + unsigned long *events, + struct notifier_block *nb); + int (*unregister_notifier)(void *iommu_data, + struct notifier_block *nb); + int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, + void *data, size_t count, bool write); + struct iommu_domain *(*group_iommu_domain)(void *iommu_data, + struct iommu_group *group); + void (*notify)(void *iommu_data, + enum vfio_iommu_notify_type event); +}; + +int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); +void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index fe888b5dcc00..3efd09faeca4 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -20,6 +20,7 @@ #include #include #include +#include "vfio.h" #include #include diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0e9217687f5c..2e51e4390c15 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -40,6 +40,7 @@ #include #include #include +#include "vfio.h" #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson " diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 7a57a0077f96..76191d7abed1 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -82,50 +82,6 @@ extern void vfio_device_put(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); -/* events for the backend driver notify callback */ -enum vfio_iommu_notify_type { - VFIO_IOMMU_CONTAINER_CLOSE = 0, -}; - -/** - * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks - */ -struct vfio_iommu_driver_ops { - char *name; - struct module *owner; - void *(*open)(unsigned long arg); - void (*release)(void *iommu_data); - long (*ioctl)(void *iommu_data, unsigned int cmd, - unsigned long arg); - int (*attach_group)(void *iommu_data, - struct iommu_group *group); - void (*detach_group)(void *iommu_data, - struct iommu_group *group); - int (*pin_pages)(void *iommu_data, - struct iommu_group *group, - unsigned long *user_pfn, - int npage, int prot, - unsigned long *phys_pfn); - int (*unpin_pages)(void *iommu_data, - unsigned long *user_pfn, int npage); - int (*register_notifier)(void *iommu_data, - unsigned long *events, - struct notifier_block *nb); - int (*unregister_notifier)(void *iommu_data, - struct notifier_block *nb); - int (*dma_rw)(void *iommu_data, dma_addr_t user_iova, - void *data, size_t count, bool write); - struct iommu_domain *(*group_iommu_domain)(void *iommu_data, - struct iommu_group *group); - void (*notify)(void *iommu_data, - enum vfio_iommu_notify_type event); -}; - -extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); - -extern void vfio_unregister_iommu_driver( - const struct vfio_iommu_driver_ops *ops); - /* * External user API */ From fda49d97f2c4faf0b42e8796d3e6c868d992f3af Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:57:00 +0200 Subject: [PATCH 12/27] vfio: remove the unused mdev iommu hook The iommu_device field in struct mdev_device has never been used since it was added more than 2 years ago. This is a manual revert of commit 7bd50f0cd2 ("vfio/type1: Add domain at(de)taching group helpers"). Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-11-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 133 +++++++------------------------- include/linux/mdev.h | 20 ----- 2 files changed, 26 insertions(+), 127 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 2e51e4390c15..42a6be1fb726 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -114,7 +114,6 @@ struct vfio_batch { struct vfio_iommu_group { struct iommu_group *iommu_group; struct list_head next; - bool mdev_group; /* An mdev group */ bool pinned_page_dirty_scope; }; @@ -1935,61 +1934,6 @@ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, return ret; } -static int vfio_mdev_attach_domain(struct device *dev, void *data) -{ - struct mdev_device *mdev = to_mdev_device(dev); - struct iommu_domain *domain = data; - struct device *iommu_device; - - iommu_device = mdev_get_iommu_device(mdev); - if (iommu_device) { - if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) - return iommu_aux_attach_device(domain, iommu_device); - else - return iommu_attach_device(domain, iommu_device); - } - - return -EINVAL; -} - -static int vfio_mdev_detach_domain(struct device *dev, void *data) -{ - struct mdev_device *mdev = to_mdev_device(dev); - struct iommu_domain *domain = data; - struct device *iommu_device; - - iommu_device = mdev_get_iommu_device(mdev); - if (iommu_device) { - if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX)) - iommu_aux_detach_device(domain, iommu_device); - else - iommu_detach_device(domain, iommu_device); - } - - return 0; -} - -static int vfio_iommu_attach_group(struct vfio_domain *domain, - struct vfio_iommu_group *group) -{ - if (group->mdev_group) - return iommu_group_for_each_dev(group->iommu_group, - domain->domain, - vfio_mdev_attach_domain); - else - return iommu_attach_group(domain->domain, group->iommu_group); -} - -static void vfio_iommu_detach_group(struct vfio_domain *domain, - struct vfio_iommu_group *group) -{ - if (group->mdev_group) - iommu_group_for_each_dev(group->iommu_group, domain->domain, - vfio_mdev_detach_domain); - else - iommu_detach_group(domain->domain, group->iommu_group); -} - static bool vfio_bus_is_mdev(struct bus_type *bus) { struct bus_type *mdev_bus; @@ -2004,20 +1948,6 @@ static bool vfio_bus_is_mdev(struct bus_type *bus) return ret; } -static int vfio_mdev_iommu_device(struct device *dev, void *data) -{ - struct mdev_device *mdev = to_mdev_device(dev); - struct device **old = data, *new; - - new = mdev_get_iommu_device(mdev); - if (!new || (*old && *old != new)) - return -EINVAL; - - *old = new; - - return 0; -} - /* * This is a helper function to insert an address range to iova list. * The list is initially created with a single entry corresponding to @@ -2278,38 +2208,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_free; if (vfio_bus_is_mdev(bus)) { - struct device *iommu_device = NULL; - - group->mdev_group = true; - - /* Determine the isolation type */ - ret = iommu_group_for_each_dev(iommu_group, &iommu_device, - vfio_mdev_iommu_device); - if (ret || !iommu_device) { - if (!iommu->external_domain) { - INIT_LIST_HEAD(&domain->group_list); - iommu->external_domain = domain; - vfio_update_pgsize_bitmap(iommu); - } else { - kfree(domain); - } - - list_add(&group->next, - &iommu->external_domain->group_list); - /* - * Non-iommu backed group cannot dirty memory directly, - * it can only use interfaces that provide dirty - * tracking. - * The iommu scope can only be promoted with the - * addition of a dirty tracking group. - */ - group->pinned_page_dirty_scope = true; - mutex_unlock(&iommu->lock); - - return 0; + if (!iommu->external_domain) { + INIT_LIST_HEAD(&domain->group_list); + iommu->external_domain = domain; + vfio_update_pgsize_bitmap(iommu); + } else { + kfree(domain); } - bus = iommu_device->bus; + list_add(&group->next, &iommu->external_domain->group_list); + /* + * Non-iommu backed group cannot dirty memory directly, it can + * only use interfaces that provide dirty tracking. + * The iommu scope can only be promoted with the addition of a + * dirty tracking group. + */ + group->pinned_page_dirty_scope = true; + mutex_unlock(&iommu->lock); + + return 0; } domain->domain = iommu_domain_alloc(bus); @@ -2324,7 +2241,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_domain; } - ret = vfio_iommu_attach_group(domain, group); + ret = iommu_attach_group(domain->domain, group->iommu_group); if (ret) goto out_domain; @@ -2391,15 +2308,17 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, list_for_each_entry(d, &iommu->domain_list, next) { if (d->domain->ops == domain->domain->ops && d->prot == domain->prot) { - vfio_iommu_detach_group(domain, group); - if (!vfio_iommu_attach_group(d, group)) { + iommu_detach_group(domain->domain, group->iommu_group); + if (!iommu_attach_group(d->domain, + group->iommu_group)) { list_add(&group->next, &d->group_list); iommu_domain_free(domain->domain); kfree(domain); goto done; } - ret = vfio_iommu_attach_group(domain, group); + ret = iommu_attach_group(domain->domain, + group->iommu_group); if (ret) goto out_domain; } @@ -2436,7 +2355,7 @@ done: return 0; out_detach: - vfio_iommu_detach_group(domain, group); + iommu_detach_group(domain->domain, group->iommu_group); out_domain: iommu_domain_free(domain->domain); vfio_iommu_iova_free(&iova_copy); @@ -2601,7 +2520,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, if (!group) continue; - vfio_iommu_detach_group(domain, group); + iommu_detach_group(domain->domain, group->iommu_group); update_dirty_scope = !group->pinned_page_dirty_scope; list_del(&group->next); kfree(group); @@ -2689,7 +2608,7 @@ static void vfio_release_domain(struct vfio_domain *domain, bool external) list_for_each_entry_safe(group, group_tmp, &domain->group_list, next) { if (!external) - vfio_iommu_detach_group(domain, group); + iommu_detach_group(domain->domain, group->iommu_group); list_del(&group->next); kfree(group); } diff --git a/include/linux/mdev.h b/include/linux/mdev.h index 68427e8fadeb..15d03f6532d0 100644 --- a/include/linux/mdev.h +++ b/include/linux/mdev.h @@ -18,7 +18,6 @@ struct mdev_device { void *driver_data; struct list_head next; struct mdev_type *type; - struct device *iommu_device; bool active; }; @@ -27,25 +26,6 @@ static inline struct mdev_device *to_mdev_device(struct device *dev) return container_of(dev, struct mdev_device, dev); } -/* - * Called by the parent device driver to set the device which represents - * this mdev in iommu protection scope. By default, the iommu device is - * NULL, that indicates using vendor defined isolation. - * - * @dev: the mediated device that iommu will isolate. - * @iommu_device: a pci device which represents the iommu for @dev. - */ -static inline void mdev_set_iommu_device(struct mdev_device *mdev, - struct device *iommu_device) -{ - mdev->iommu_device = iommu_device; -} - -static inline struct device *mdev_get_iommu_device(struct mdev_device *mdev) -{ - return mdev->iommu_device; -} - unsigned int mdev_get_type_group_id(struct mdev_device *mdev); unsigned int mtype_get_type_group_id(struct mdev_type *mtype); struct device *mtype_get_parent_dev(struct mdev_type *mtype); From c3c0fa9d94f7078b6664d29d79484d071a144f2e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:57:01 +0200 Subject: [PATCH 13/27] vfio: clean up the check for mediated device in vfio_iommu_type1 Pass the group flags to ->attach_group and remove the messy check for the bus type. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-12-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 32 +++++------------------------ drivers/vfio/vfio.h | 27 +++++++++++++++++++++++- drivers/vfio/vfio_iommu_spapr_tce.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 19 ++--------------- 4 files changed, 34 insertions(+), 46 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 6589e296ef34..08b27b64f0f9 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -68,30 +68,6 @@ struct vfio_unbound_dev { struct list_head unbound_next; }; -enum vfio_group_type { - /* - * Physical device with IOMMU backing. - */ - VFIO_IOMMU, - - /* - * Virtual device without IOMMU backing. The VFIO core fakes up an - * iommu_group as the iommu_group sysfs interface is part of the - * userspace ABI. The user of these devices must not be able to - * directly trigger unmediated DMA. - */ - VFIO_EMULATED_IOMMU, - - /* - * Physical device without IOMMU backing. The VFIO core fakes up an - * iommu_group as the iommu_group sysfs interface is part of the - * userspace ABI. Users can trigger unmediated DMA by the device, - * usage is highly dangerous, requires an explicit opt-in and will - * taint the kernel. - */ - VFIO_NO_IOMMU, -}; - struct vfio_group { struct kref kref; int minor; @@ -219,7 +195,7 @@ static long vfio_noiommu_ioctl(void *iommu_data, } static int vfio_noiommu_attach_group(void *iommu_data, - struct iommu_group *iommu_group) + struct iommu_group *iommu_group, enum vfio_group_type type) { return 0; } @@ -1129,7 +1105,8 @@ static int __vfio_container_attach_groups(struct vfio_container *container, int ret = -ENODEV; list_for_each_entry(group, &container->group_list, container_next) { - ret = driver->ops->attach_group(data, group->iommu_group); + ret = driver->ops->attach_group(data, group->iommu_group, + group->type); if (ret) goto unwind; } @@ -1387,7 +1364,8 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd) driver = container->iommu_driver; if (driver) { ret = driver->ops->attach_group(container->iommu_data, - group->iommu_group); + group->iommu_group, + group->type); if (ret) goto unlock_out; } diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index a78de649eb2f..a67130221151 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -4,6 +4,30 @@ * Author: Alex Williamson */ +enum vfio_group_type { + /* + * Physical device with IOMMU backing. + */ + VFIO_IOMMU, + + /* + * Virtual device without IOMMU backing. The VFIO core fakes up an + * iommu_group as the iommu_group sysfs interface is part of the + * userspace ABI. The user of these devices must not be able to + * directly trigger unmediated DMA. + */ + VFIO_EMULATED_IOMMU, + + /* + * Physical device without IOMMU backing. The VFIO core fakes up an + * iommu_group as the iommu_group sysfs interface is part of the + * userspace ABI. Users can trigger unmediated DMA by the device, + * usage is highly dangerous, requires an explicit opt-in and will + * taint the kernel. + */ + VFIO_NO_IOMMU, +}; + /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { VFIO_IOMMU_CONTAINER_CLOSE = 0, @@ -20,7 +44,8 @@ struct vfio_iommu_driver_ops { long (*ioctl)(void *iommu_data, unsigned int cmd, unsigned long arg); int (*attach_group)(void *iommu_data, - struct iommu_group *group); + struct iommu_group *group, + enum vfio_group_type); void (*detach_group)(void *iommu_data, struct iommu_group *group); int (*pin_pages)(void *iommu_data, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 3efd09faeca4..936a26b13c0b 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -1239,7 +1239,7 @@ release_exit: } static int tce_iommu_attach_group(void *iommu_data, - struct iommu_group *iommu_group) + struct iommu_group *iommu_group, enum vfio_group_type type) { int ret = 0; struct tce_container *container = iommu_data; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 42a6be1fb726..a48e9f597cb2 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -1934,20 +1933,6 @@ static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, return ret; } -static bool vfio_bus_is_mdev(struct bus_type *bus) -{ - struct bus_type *mdev_bus; - bool ret = false; - - mdev_bus = symbol_get(mdev_bus_type); - if (mdev_bus) { - ret = (bus == mdev_bus); - symbol_put(mdev_bus_type); - } - - return ret; -} - /* * This is a helper function to insert an address range to iova list. * The list is initially created with a single entry corresponding to @@ -2172,7 +2157,7 @@ static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu, } static int vfio_iommu_type1_attach_group(void *iommu_data, - struct iommu_group *iommu_group) + struct iommu_group *iommu_group, enum vfio_group_type type) { struct vfio_iommu *iommu = iommu_data; struct vfio_iommu_group *group; @@ -2207,7 +2192,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_free; - if (vfio_bus_is_mdev(bus)) { + if (type == VFIO_EMULATED_IOMMU) { if (!iommu->external_domain) { INIT_LIST_HEAD(&domain->group_list); iommu->external_domain = domain; From 898639041484003e219e35fd1623b4bb1f4c0d04 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:57:02 +0200 Subject: [PATCH 14/27] vfio/spapr_tce: reject mediated devices Unlike the the type1 IOMMU backend, the SPAPR one does not contain any support for the magic non-IOMMU backed iommu_group used by mediated devices, so reject them in ->attach_group. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-13-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_spapr_tce.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 936a26b13c0b..708a95e61831 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -1246,6 +1246,9 @@ static int tce_iommu_attach_group(void *iommu_data, struct iommu_table_group *table_group; struct tce_iommu_group *tcegrp = NULL; + if (type == VFIO_EMULATED_IOMMU) + return -EINVAL; + mutex_lock(&container->lock); /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", From 65cdbf10633783647c45eb22f0e70f193fee3789 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:57:03 +0200 Subject: [PATCH 15/27] vfio/iommu_type1: initialize pgsize_bitmap in ->open Ensure pgsize_bitmap is always valid by initializing it to PAGE_MASK in vfio_iommu_type1_open and remove the now pointless update for the external domain case in vfio_iommu_type1_attach_group, which was just setting pgsize_bitmap to PAGE_MASK when only external domains were attached. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20210924155705.4258-14-hch@lst.de [aw: s/ULONG_MAX/PAGE_MASK/ per discussion in link] Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index a48e9f597cb2..27539603eb93 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2196,7 +2196,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (!iommu->external_domain) { INIT_LIST_HEAD(&domain->group_list); iommu->external_domain = domain; - vfio_update_pgsize_bitmap(iommu); } else { kfree(domain); } @@ -2582,6 +2581,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) mutex_init(&iommu->lock); BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); init_waitqueue_head(&iommu->vaddr_wait); + iommu->pgsize_bitmap = PAGE_MASK; return iommu; } From 296e505baddf92ed73f6ca2ddbb426357bb5abc4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:57:04 +0200 Subject: [PATCH 16/27] vfio/iommu_type1: remove the "external" domain The external_domain concept rather misleading and not actually needed. Replace it with a list of emulated groups in struct vfio_iommu. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-15-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 120 ++++++++++++++------------------ 1 file changed, 54 insertions(+), 66 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 27539603eb93..caf638cedeba 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -65,7 +65,6 @@ MODULE_PARM_DESC(dma_entry_limit, struct vfio_iommu { struct list_head domain_list; struct list_head iova_list; - struct vfio_domain *external_domain; /* domain for external user */ struct mutex lock; struct rb_root dma_list; struct blocking_notifier_head notifier; @@ -78,6 +77,7 @@ struct vfio_iommu { bool nesting; bool dirty_page_tracking; bool container_open; + struct list_head emulated_iommu_groups; }; struct vfio_domain { @@ -1892,8 +1892,8 @@ static struct vfio_iommu_group* vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, struct iommu_group *iommu_group) { + struct vfio_iommu_group *group; struct vfio_domain *domain; - struct vfio_iommu_group *group = NULL; list_for_each_entry(domain, &iommu->domain_list, next) { group = find_iommu_group(domain, iommu_group); @@ -1901,10 +1901,10 @@ vfio_iommu_find_iommu_group(struct vfio_iommu *iommu, return group; } - if (iommu->external_domain) - group = find_iommu_group(iommu->external_domain, iommu_group); - - return group; + list_for_each_entry(group, &iommu->emulated_iommu_groups, next) + if (group->iommu_group == iommu_group) + return group; + return NULL; } static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions, @@ -2163,61 +2163,52 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, struct vfio_iommu_group *group; struct vfio_domain *domain, *d; struct bus_type *bus = NULL; - int ret; bool resv_msi, msi_remap; phys_addr_t resv_msi_base = 0; struct iommu_domain_geometry *geo; LIST_HEAD(iova_copy); LIST_HEAD(group_resv_regions); + int ret = -EINVAL; mutex_lock(&iommu->lock); /* Check for duplicates */ - if (vfio_iommu_find_iommu_group(iommu, iommu_group)) { - mutex_unlock(&iommu->lock); - return -EINVAL; - } + if (vfio_iommu_find_iommu_group(iommu, iommu_group)) + goto out_unlock; + ret = -ENOMEM; group = kzalloc(sizeof(*group), GFP_KERNEL); - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!group || !domain) { - ret = -ENOMEM; - goto out_free; - } - + if (!group) + goto out_unlock; group->iommu_group = iommu_group; - /* Determine bus_type in order to allocate a domain */ - ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type); - if (ret) - goto out_free; - if (type == VFIO_EMULATED_IOMMU) { - if (!iommu->external_domain) { - INIT_LIST_HEAD(&domain->group_list); - iommu->external_domain = domain; - } else { - kfree(domain); - } - - list_add(&group->next, &iommu->external_domain->group_list); + list_add(&group->next, &iommu->emulated_iommu_groups); /* - * Non-iommu backed group cannot dirty memory directly, it can + * An emulated IOMMU group cannot dirty memory directly, it can * only use interfaces that provide dirty tracking. * The iommu scope can only be promoted with the addition of a * dirty tracking group. */ group->pinned_page_dirty_scope = true; - mutex_unlock(&iommu->lock); - - return 0; + ret = 0; + goto out_unlock; } + /* Determine bus_type in order to allocate a domain */ + ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type); + if (ret) + goto out_free_group; + + ret = -ENOMEM; + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + goto out_free_group; + + ret = -EIO; domain->domain = iommu_domain_alloc(bus); - if (!domain->domain) { - ret = -EIO; - goto out_free; - } + if (!domain->domain) + goto out_free_domain; if (iommu->nesting) { ret = iommu_enable_nesting(domain->domain); @@ -2344,9 +2335,11 @@ out_domain: iommu_domain_free(domain->domain); vfio_iommu_iova_free(&iova_copy); vfio_iommu_resv_free(&group_resv_regions); -out_free: +out_free_domain: kfree(domain); +out_free_group: kfree(group); +out_unlock: mutex_unlock(&iommu->lock); return ret; } @@ -2471,25 +2464,19 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, LIST_HEAD(iova_copy); mutex_lock(&iommu->lock); + list_for_each_entry(group, &iommu->emulated_iommu_groups, next) { + if (group->iommu_group != iommu_group) + continue; + update_dirty_scope = !group->pinned_page_dirty_scope; + list_del(&group->next); + kfree(group); - if (iommu->external_domain) { - group = find_iommu_group(iommu->external_domain, iommu_group); - if (group) { - update_dirty_scope = !group->pinned_page_dirty_scope; - list_del(&group->next); - kfree(group); - - if (list_empty(&iommu->external_domain->group_list)) { - if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { - WARN_ON(iommu->notifier.head); - vfio_iommu_unmap_unpin_all(iommu); - } - - kfree(iommu->external_domain); - iommu->external_domain = NULL; - } - goto detach_group_done; + if (list_empty(&iommu->emulated_iommu_groups) && + !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { + WARN_ON(iommu->notifier.head); + vfio_iommu_unmap_unpin_all(iommu); } + goto detach_group_done; } /* @@ -2517,7 +2504,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, */ if (list_empty(&domain->group_list)) { if (list_is_singular(&iommu->domain_list)) { - if (!iommu->external_domain) { + if (list_empty(&iommu->emulated_iommu_groups)) { WARN_ON(iommu->notifier.head); vfio_iommu_unmap_unpin_all(iommu); } else { @@ -2582,41 +2569,42 @@ static void *vfio_iommu_type1_open(unsigned long arg) BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); init_waitqueue_head(&iommu->vaddr_wait); iommu->pgsize_bitmap = PAGE_MASK; + INIT_LIST_HEAD(&iommu->emulated_iommu_groups); return iommu; } -static void vfio_release_domain(struct vfio_domain *domain, bool external) +static void vfio_release_domain(struct vfio_domain *domain) { struct vfio_iommu_group *group, *group_tmp; list_for_each_entry_safe(group, group_tmp, &domain->group_list, next) { - if (!external) - iommu_detach_group(domain->domain, group->iommu_group); + iommu_detach_group(domain->domain, group->iommu_group); list_del(&group->next); kfree(group); } - if (!external) - iommu_domain_free(domain->domain); + iommu_domain_free(domain->domain); } static void vfio_iommu_type1_release(void *iommu_data) { struct vfio_iommu *iommu = iommu_data; struct vfio_domain *domain, *domain_tmp; + struct vfio_iommu_group *group, *next_group; - if (iommu->external_domain) { - vfio_release_domain(iommu->external_domain, true); - kfree(iommu->external_domain); + list_for_each_entry_safe(group, next_group, + &iommu->emulated_iommu_groups, next) { + list_del(&group->next); + kfree(group); } vfio_iommu_unmap_unpin_all(iommu); list_for_each_entry_safe(domain, domain_tmp, &iommu->domain_list, next) { - vfio_release_domain(domain, false); + vfio_release_domain(domain); list_del(&domain->next); kfree(domain); } From 3f901389fa88d32d660cf2209aac6e395902d017 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Sep 2021 17:57:05 +0200 Subject: [PATCH 17/27] vfio/iommu_type1: remove IS_IOMMU_CAP_DOMAIN_IN_CONTAINER IS_IOMMU_CAP_DOMAIN_IN_CONTAINER just obsfucated the checks being performed, so open code it in the callers. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20210924155705.4258-16-hch@lst.de Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index caf638cedeba..f17490ab238f 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -139,9 +139,6 @@ struct vfio_regions { size_t len; }; -#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ - (!list_empty(&iommu->domain_list)) - #define DIRTY_BITMAP_BYTES(n) (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE) /* @@ -879,7 +876,7 @@ again: * already pinned and accounted. Accounting should be done if there is no * iommu capable domain in the container. */ - do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); + do_accounting = list_empty(&iommu->domain_list); for (i = 0; i < npage; i++) { struct vfio_pfn *vpfn; @@ -968,7 +965,7 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data, mutex_lock(&iommu->lock); - do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); + do_accounting = list_empty(&iommu->domain_list); for (i = 0; i < npage; i++) { struct vfio_dma *dma; dma_addr_t iova; @@ -1089,7 +1086,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, if (!dma->size) return 0; - if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) + if (list_empty(&iommu->domain_list)) return 0; /* @@ -1666,7 +1663,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, vfio_link_dma(iommu, dma); /* Don't pin and map if container doesn't contain IOMMU capable domain*/ - if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) + if (list_empty(&iommu->domain_list)) dma->size = size; else ret = vfio_pin_map_dma(iommu, dma, size); @@ -2472,7 +2469,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, kfree(group); if (list_empty(&iommu->emulated_iommu_groups) && - !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) { + list_empty(&iommu->domain_list)) { WARN_ON(iommu->notifier.head); vfio_iommu_unmap_unpin_all(iommu); } From 49ba1a2976c8305647f772eec04c9596c12c6d3f Mon Sep 17 00:00:00 2001 From: Colin Xu Date: Tue, 12 Oct 2021 20:48:55 +0800 Subject: [PATCH 18/27] vfio/pci: Add OpRegion 2.0+ Extended VBT support. Due to historical reason, some legacy shipped system doesn't follow OpRegion 2.1 spec but still stick to OpRegion 2.0, in which the extended VBT is not contiguous after OpRegion in physical address, but any location pointed by RVDA via absolute address. Also although current OpRegion 2.1+ systems appears that the extended VBT follows OpRegion, RVDA is the relative address to OpRegion head, the extended VBT location may change to non-contiguous to OpRegion. In both cases, it's impossible to map a contiguous range to hold both OpRegion and the extended VBT and expose via one vfio region. The only difference between OpRegion 2.0 and 2.1 is where extended VBT is stored: For 2.0, RVDA is the absolute address of extended VBT while for 2.1, RVDA is the relative address of extended VBT to OpRegion baes, and there is no other difference between OpRegion 2.0 and 2.1. To support the non-contiguous region case as described, the updated read op will patch OpRegion version and RVDA on-the-fly accordingly. So that from vfio igd OpRegion view, only 2.1+ with contiguous extended VBT after OpRegion is exposed, regardless the underneath host OpRegion is 2.0 or 2.1+. The mechanism makes it possible to support legacy OpRegion 2.0 extended VBT systems with on the market, and support OpRegion 2.1+ where the extended VBT isn't contiguous after OpRegion. Cc: Zhenyu Wang Cc: Hang Yuan Cc: Swee Yee Fonn Cc: Fred Gao Signed-off-by: Colin Xu Link: https://lore.kernel.org/r/20211012124855.52463-1-colin.xu@gmail.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_igd.c | 236 ++++++++++++++++++++++++-------- 1 file changed, 176 insertions(+), 60 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 7ca4109bba48..56cd551e0e04 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -25,20 +25,121 @@ #define OPREGION_RVDS 0x3c2 #define OPREGION_VERSION 0x16 +struct igd_opregion_vbt { + void *opregion; + void *vbt_ex; +}; + +/** + * igd_opregion_shift_copy() - Copy OpRegion to user buffer and shift position. + * @dst: User buffer ptr to copy to. + * @off: Offset to user buffer ptr. Increased by bytes on return. + * @src: Source buffer to copy from. + * @pos: Increased by bytes on return. + * @remaining: Decreased by bytes on return. + * @bytes: Bytes to copy and adjust off, pos and remaining. + * + * Copy OpRegion to offset from specific source ptr and shift the offset. + * + * Return: 0 on success, -EFAULT otherwise. + * + */ +static inline unsigned long igd_opregion_shift_copy(char __user *dst, + loff_t *off, + void *src, + loff_t *pos, + size_t *remaining, + size_t bytes) +{ + if (copy_to_user(dst + (*off), src, bytes)) + return -EFAULT; + + *off += bytes; + *pos += bytes; + *remaining -= bytes; + + return 0; +} + static ssize_t vfio_pci_igd_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; - void *base = vdev->region[i].data; - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + struct igd_opregion_vbt *opregionvbt = vdev->region[i].data; + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK, off = 0; + size_t remaining; if (pos >= vdev->region[i].size || iswrite) return -EINVAL; - count = min(count, (size_t)(vdev->region[i].size - pos)); + count = min_t(size_t, count, vdev->region[i].size - pos); + remaining = count; - if (copy_to_user(buf, base + pos, count)) + /* Copy until OpRegion version */ + if (remaining && pos < OPREGION_VERSION) { + size_t bytes = min_t(size_t, remaining, OPREGION_VERSION - pos); + + if (igd_opregion_shift_copy(buf, &off, + opregionvbt->opregion + pos, &pos, + &remaining, bytes)) + return -EFAULT; + } + + /* Copy patched (if necessary) OpRegion version */ + if (remaining && pos < OPREGION_VERSION + sizeof(__le16)) { + size_t bytes = min_t(size_t, remaining, + OPREGION_VERSION + sizeof(__le16) - pos); + __le16 version = *(__le16 *)(opregionvbt->opregion + + OPREGION_VERSION); + + /* Patch to 2.1 if OpRegion 2.0 has extended VBT */ + if (le16_to_cpu(version) == 0x0200 && opregionvbt->vbt_ex) + version = cpu_to_le16(0x0201); + + if (igd_opregion_shift_copy(buf, &off, + &version + (pos - OPREGION_VERSION), + &pos, &remaining, bytes)) + return -EFAULT; + } + + /* Copy until RVDA */ + if (remaining && pos < OPREGION_RVDA) { + size_t bytes = min_t(size_t, remaining, OPREGION_RVDA - pos); + + if (igd_opregion_shift_copy(buf, &off, + opregionvbt->opregion + pos, &pos, + &remaining, bytes)) + return -EFAULT; + } + + /* Copy modified (if necessary) RVDA */ + if (remaining && pos < OPREGION_RVDA + sizeof(__le64)) { + size_t bytes = min_t(size_t, remaining, + OPREGION_RVDA + sizeof(__le64) - pos); + __le64 rvda = cpu_to_le64(opregionvbt->vbt_ex ? + OPREGION_SIZE : 0); + + if (igd_opregion_shift_copy(buf, &off, + &rvda + (pos - OPREGION_RVDA), + &pos, &remaining, bytes)) + return -EFAULT; + } + + /* Copy the rest of OpRegion */ + if (remaining && pos < OPREGION_SIZE) { + size_t bytes = min_t(size_t, remaining, OPREGION_SIZE - pos); + + if (igd_opregion_shift_copy(buf, &off, + opregionvbt->opregion + pos, &pos, + &remaining, bytes)) + return -EFAULT; + } + + /* Copy extended VBT if exists */ + if (remaining && + copy_to_user(buf + off, opregionvbt->vbt_ex + (pos - OPREGION_SIZE), + remaining)) return -EFAULT; *ppos += count; @@ -49,7 +150,13 @@ static ssize_t vfio_pci_igd_rw(struct vfio_pci_core_device *vdev, static void vfio_pci_igd_release(struct vfio_pci_core_device *vdev, struct vfio_pci_region *region) { - memunmap(region->data); + struct igd_opregion_vbt *opregionvbt = region->data; + + if (opregionvbt->vbt_ex) + memunmap(opregionvbt->vbt_ex); + + memunmap(opregionvbt->opregion); + kfree(opregionvbt); } static const struct vfio_pci_regops vfio_pci_igd_regops = { @@ -61,7 +168,7 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev) { __le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR); u32 addr, size; - void *base; + struct igd_opregion_vbt *opregionvbt; int ret; u16 version; @@ -72,84 +179,93 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev) if (!addr || !(~addr)) return -ENODEV; - base = memremap(addr, OPREGION_SIZE, MEMREMAP_WB); - if (!base) + opregionvbt = kzalloc(sizeof(*opregionvbt), GFP_KERNEL); + if (!opregionvbt) return -ENOMEM; - if (memcmp(base, OPREGION_SIGNATURE, 16)) { - memunmap(base); + opregionvbt->opregion = memremap(addr, OPREGION_SIZE, MEMREMAP_WB); + if (!opregionvbt->opregion) { + kfree(opregionvbt); + return -ENOMEM; + } + + if (memcmp(opregionvbt->opregion, OPREGION_SIGNATURE, 16)) { + memunmap(opregionvbt->opregion); + kfree(opregionvbt); return -EINVAL; } - size = le32_to_cpu(*(__le32 *)(base + 16)); + size = le32_to_cpu(*(__le32 *)(opregionvbt->opregion + 16)); if (!size) { - memunmap(base); + memunmap(opregionvbt->opregion); + kfree(opregionvbt); return -EINVAL; } size *= 1024; /* In KB */ /* - * Support opregion v2.1+ - * When VBT data exceeds 6KB size and cannot be within mailbox #4, then - * the Extended VBT region next to opregion is used to hold the VBT data. - * RVDA (Relative Address of VBT Data from Opregion Base) and RVDS - * (Raw VBT Data Size) from opregion structure member are used to hold the - * address from region base and size of VBT data. RVDA/RVDS are not - * defined before opregion 2.0. + * OpRegion and VBT: + * When VBT data doesn't exceed 6KB, it's stored in Mailbox #4. + * When VBT data exceeds 6KB size, Mailbox #4 is no longer large enough + * to hold the VBT data, the Extended VBT region is introduced since + * OpRegion 2.0 to hold the VBT data. Since OpRegion 2.0, RVDA/RVDS are + * introduced to define the extended VBT data location and size. + * OpRegion 2.0: RVDA defines the absolute physical address of the + * extended VBT data, RVDS defines the VBT data size. + * OpRegion 2.1 and above: RVDA defines the relative address of the + * extended VBT data to OpRegion base, RVDS defines the VBT data size. * - * opregion 2.1+: RVDA is unsigned, relative offset from - * opregion base, and should point to the end of opregion. - * otherwise, exposing to userspace to allow read access to everything between - * the OpRegion and VBT is not safe. - * RVDS is defined as size in bytes. - * - * opregion 2.0: rvda is the physical VBT address. - * Since rvda is HPA it cannot be directly used in guest. - * And it should not be practically available for end user,so it is not supported. + * Due to the RVDA definition diff in OpRegion VBT (also the only diff + * between 2.0 and 2.1), exposing OpRegion and VBT as a contiguous range + * for OpRegion 2.0 and above makes it possible to support the + * non-contiguous VBT through a single vfio region. From r/w ops view, + * only contiguous VBT after OpRegion with version 2.1+ is exposed, + * regardless the host OpRegion is 2.0 or non-contiguous 2.1+. The r/w + * ops will on-the-fly shift the actural offset into VBT so that data at + * correct position can be returned to the requester. */ - version = le16_to_cpu(*(__le16 *)(base + OPREGION_VERSION)); + version = le16_to_cpu(*(__le16 *)(opregionvbt->opregion + + OPREGION_VERSION)); if (version >= 0x0200) { - u64 rvda; - u32 rvds; + u64 rvda = le64_to_cpu(*(__le64 *)(opregionvbt->opregion + + OPREGION_RVDA)); + u32 rvds = le32_to_cpu(*(__le32 *)(opregionvbt->opregion + + OPREGION_RVDS)); - rvda = le64_to_cpu(*(__le64 *)(base + OPREGION_RVDA)); - rvds = le32_to_cpu(*(__le32 *)(base + OPREGION_RVDS)); + /* The extended VBT is valid only when RVDA/RVDS are non-zero */ if (rvda && rvds) { - /* no support for opregion v2.0 with physical VBT address */ - if (version == 0x0200) { - memunmap(base); - pci_err(vdev->pdev, - "IGD assignment does not support opregion v2.0 with an extended VBT region\n"); - return -EINVAL; - } - - if (rvda != size) { - memunmap(base); - pci_err(vdev->pdev, - "Extended VBT does not follow opregion on version 0x%04x\n", - version); - return -EINVAL; - } - - /* region size for opregion v2.0+: opregion and VBT size. */ size += rvds; - } - } - if (size != OPREGION_SIZE) { - memunmap(base); - base = memremap(addr, size, MEMREMAP_WB); - if (!base) - return -ENOMEM; + /* + * Extended VBT location by RVDA: + * Absolute physical addr for 2.0. + * Relative addr to OpRegion header for 2.1+. + */ + if (version == 0x0200) + addr = rvda; + else + addr += rvda; + + opregionvbt->vbt_ex = memremap(addr, rvds, MEMREMAP_WB); + if (!opregionvbt->vbt_ex) { + memunmap(opregionvbt->opregion); + kfree(opregionvbt); + return -ENOMEM; + } + } } ret = vfio_pci_register_dev_region(vdev, PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, - VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, - &vfio_pci_igd_regops, size, VFIO_REGION_INFO_FLAG_READ, base); + VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &vfio_pci_igd_regops, + size, VFIO_REGION_INFO_FLAG_READ, opregionvbt); if (ret) { - memunmap(base); + if (opregionvbt->vbt_ex) + memunmap(opregionvbt->vbt_ex); + + memunmap(opregionvbt->opregion); + kfree(opregionvbt); return ret; } From 63b150fde7a2549e8bf7cb0fca7e9cfb9cad50d7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 15 Oct 2021 08:40:50 -0300 Subject: [PATCH 19/27] vfio: Delete vfio_get/put_group from vfio_iommu_group_notifier() iommu_group_register_notifier()/iommu_group_unregister_notifier() are built using a blocking_notifier_chain which integrates a rwsem. The notifier function cannot be running outside its registration. When considering how the notifier function interacts with create/destroy of the group there are two fringe cases, the notifier starts before list_add(&vfio.group_list) and the notifier runs after the kref becomes 0. Prior to vfio_create_group() unlocking and returning we have container_users == 0 device_list == empty And this cannot change until the mutex is unlocked. After the kref goes to zero we must also have container_users == 0 device_list == empty Both are required because they are balanced operations and a 0 kref means some caller became unbalanced. Add the missing assertion that container_users must be zero as well. These two facts are important because when checking each operation we see: - IOMMU_GROUP_NOTIFY_ADD_DEVICE Empty device_list avoids the WARN_ON in vfio_group_nb_add_dev() 0 container_users ends the call - IOMMU_GROUP_NOTIFY_BOUND_DRIVER 0 container_users ends the call Finally, we have IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER, which only deletes items from the unbound list. During creation this list is empty, during kref == 0 nothing can read this list, and it will be freed soon. Since the vfio_group_release() doesn't hold the appropriate lock to manipulate the unbound_list and could race with the notifier, move the cleanup to directly before the kfree. This allows deleting all of the deferred group put code. Reviewed-by: Kevin Tian Reviewed-by: Liu Yi L Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v3-2fdfe4ca2cc6+18c-vfio_group_cdev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 100 +++++++------------------------------------- 1 file changed, 15 insertions(+), 85 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 08b27b64f0f9..4ce7e9fe43af 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -324,12 +324,16 @@ static void vfio_container_put(struct vfio_container *container) static void vfio_group_unlock_and_free(struct vfio_group *group) { + struct vfio_unbound_dev *unbound, *tmp; + mutex_unlock(&vfio.group_lock); - /* - * Unregister outside of lock. A spurious callback is harmless now - * that the group is no longer in vfio.group_list. - */ iommu_group_unregister_notifier(group->iommu_group, &group->nb); + + list_for_each_entry_safe(unbound, tmp, + &group->unbound_list, unbound_next) { + list_del(&unbound->unbound_next); + kfree(unbound); + } kfree(group); } @@ -360,14 +364,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); group->nb.notifier_call = vfio_iommu_group_notifier; - - /* - * blocking notifiers acquire a rwsem around registering and hold - * it around callback. Therefore, need to register outside of - * vfio.group_lock to avoid A-B/B-A contention. Our callback won't - * do anything unless it can find the group in vfio.group_list, so - * no harm in registering early. - */ ret = iommu_group_register_notifier(iommu_group, &group->nb); if (ret) { kfree(group); @@ -415,18 +411,18 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, static void vfio_group_release(struct kref *kref) { struct vfio_group *group = container_of(kref, struct vfio_group, kref); - struct vfio_unbound_dev *unbound, *tmp; struct iommu_group *iommu_group = group->iommu_group; + /* + * These data structures all have paired operations that can only be + * undone when the caller holds a live reference on the group. Since all + * pairs must be undone these WARN_ON's indicate some caller did not + * properly hold the group reference. + */ WARN_ON(!list_empty(&group->device_list)); + WARN_ON(atomic_read(&group->container_users)); WARN_ON(group->notifier.head); - list_for_each_entry_safe(unbound, tmp, - &group->unbound_list, unbound_next) { - list_del(&unbound->unbound_next); - kfree(unbound); - } - device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor)); list_del(&group->vfio_next); vfio_free_group_minor(group->minor); @@ -439,61 +435,12 @@ static void vfio_group_put(struct vfio_group *group) kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock); } -struct vfio_group_put_work { - struct work_struct work; - struct vfio_group *group; -}; - -static void vfio_group_put_bg(struct work_struct *work) -{ - struct vfio_group_put_work *do_work; - - do_work = container_of(work, struct vfio_group_put_work, work); - - vfio_group_put(do_work->group); - kfree(do_work); -} - -static void vfio_group_schedule_put(struct vfio_group *group) -{ - struct vfio_group_put_work *do_work; - - do_work = kmalloc(sizeof(*do_work), GFP_KERNEL); - if (WARN_ON(!do_work)) - return; - - INIT_WORK(&do_work->work, vfio_group_put_bg); - do_work->group = group; - schedule_work(&do_work->work); -} - /* Assume group_lock or group reference is held */ static void vfio_group_get(struct vfio_group *group) { kref_get(&group->kref); } -/* - * Not really a try as we will sleep for mutex, but we need to make - * sure the group pointer is valid under lock and get a reference. - */ -static struct vfio_group *vfio_group_try_get(struct vfio_group *group) -{ - struct vfio_group *target = group; - - mutex_lock(&vfio.group_lock); - list_for_each_entry(group, &vfio.group_list, vfio_next) { - if (group == target) { - vfio_group_get(group); - mutex_unlock(&vfio.group_lock); - return group; - } - } - mutex_unlock(&vfio.group_lock); - - return NULL; -} - static struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group) { @@ -691,14 +638,6 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, struct device *dev = data; struct vfio_unbound_dev *unbound; - /* - * Need to go through a group_lock lookup to get a reference or we - * risk racing a group being removed. Ignore spurious notifies. - */ - group = vfio_group_try_get(group); - if (!group) - return NOTIFY_OK; - switch (action) { case IOMMU_GROUP_NOTIFY_ADD_DEVICE: vfio_group_nb_add_dev(group, dev); @@ -749,15 +688,6 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, mutex_unlock(&group->unbound_lock); break; } - - /* - * If we're the last reference to the group, the group will be - * released, which includes unregistering the iommu group notifier. - * We hold a read-lock on that notifier list, unregistering needs - * a write-lock... deadlock. Release our reference asynchronously - * to avoid that situation. - */ - vfio_group_schedule_put(group); return NOTIFY_OK; } From 1ceabade1df78c420e30efd4d39a48ceb46487bc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 15 Oct 2021 08:40:51 -0300 Subject: [PATCH 20/27] vfio: Do not open code the group list search in vfio_create_group() Split vfio_group_get_from_iommu() into __vfio_group_get_from_iommu() so that vfio_create_group() can call it to consolidate this duplicated code. Reviewed-by: Liu Yi L Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v3-2fdfe4ca2cc6+18c-vfio_group_cdev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 55 ++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 4ce7e9fe43af..513fb5a4c102 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -340,10 +340,35 @@ static void vfio_group_unlock_and_free(struct vfio_group *group) /** * Group objects - create, release, get, put, search */ +static struct vfio_group * +__vfio_group_get_from_iommu(struct iommu_group *iommu_group) +{ + struct vfio_group *group; + + list_for_each_entry(group, &vfio.group_list, vfio_next) { + if (group->iommu_group == iommu_group) { + vfio_group_get(group); + return group; + } + } + return NULL; +} + +static struct vfio_group * +vfio_group_get_from_iommu(struct iommu_group *iommu_group) +{ + struct vfio_group *group; + + mutex_lock(&vfio.group_lock); + group = __vfio_group_get_from_iommu(iommu_group); + mutex_unlock(&vfio.group_lock); + return group; +} + static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, enum vfio_group_type type) { - struct vfio_group *group, *tmp; + struct vfio_group *group, *existing_group; struct device *dev; int ret, minor; @@ -373,12 +398,10 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, mutex_lock(&vfio.group_lock); /* Did we race creating this group? */ - list_for_each_entry(tmp, &vfio.group_list, vfio_next) { - if (tmp->iommu_group == iommu_group) { - vfio_group_get(tmp); - vfio_group_unlock_and_free(group); - return tmp; - } + existing_group = __vfio_group_get_from_iommu(iommu_group); + if (existing_group) { + vfio_group_unlock_and_free(group); + return existing_group; } minor = vfio_alloc_group_minor(group); @@ -441,24 +464,6 @@ static void vfio_group_get(struct vfio_group *group) kref_get(&group->kref); } -static -struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group) -{ - struct vfio_group *group; - - mutex_lock(&vfio.group_lock); - list_for_each_entry(group, &vfio.group_list, vfio_next) { - if (group->iommu_group == iommu_group) { - vfio_group_get(group); - mutex_unlock(&vfio.group_lock); - return group; - } - } - mutex_unlock(&vfio.group_lock); - - return NULL; -} - static struct vfio_group *vfio_group_get_from_minor(int minor) { struct vfio_group *group; From 325a31c920309d2225311573a05c2f4dd402b2ed Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 15 Oct 2021 08:40:52 -0300 Subject: [PATCH 21/27] vfio: Don't leak a group reference if the group already exists If vfio_create_group() searches the group list and returns an already existing group it does not put back the iommu_group reference that the caller passed in. Change the semantic of vfio_create_group() to not move the reference in from the caller, but instead obtain a new reference inside and leave the caller's reference alone. The two callers must now call iommu_group_put(). This is an unlikely race as the only caller that could hit it has already searched the group list before attempting to create the group. Fixes: cba3345cc494 ("vfio: VFIO core") Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v3-2fdfe4ca2cc6+18c-vfio_group_cdev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 513fb5a4c102..4abb2e5e1965 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -334,6 +334,7 @@ static void vfio_group_unlock_and_free(struct vfio_group *group) list_del(&unbound->unbound_next); kfree(unbound); } + iommu_group_put(group->iommu_group); kfree(group); } @@ -385,12 +386,15 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, atomic_set(&group->opened, 0); init_waitqueue_head(&group->container_q); group->iommu_group = iommu_group; + /* put in vfio_group_unlock_and_free() */ + iommu_group_ref_get(iommu_group); group->type = type; BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); group->nb.notifier_call = vfio_iommu_group_notifier; ret = iommu_group_register_notifier(iommu_group, &group->nb); if (ret) { + iommu_group_put(iommu_group); kfree(group); return ERR_PTR(ret); } @@ -426,7 +430,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, list_add(&group->vfio_next, &vfio.group_list); mutex_unlock(&vfio.group_lock); - return group; } @@ -434,7 +437,6 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, static void vfio_group_release(struct kref *kref) { struct vfio_group *group = container_of(kref, struct vfio_group, kref); - struct iommu_group *iommu_group = group->iommu_group; /* * These data structures all have paired operations that can only be @@ -450,7 +452,6 @@ static void vfio_group_release(struct kref *kref) list_del(&group->vfio_next); vfio_free_group_minor(group->minor); vfio_group_unlock_and_free(group); - iommu_group_put(iommu_group); } static void vfio_group_put(struct vfio_group *group) @@ -735,7 +736,7 @@ static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, ret = PTR_ERR(group); goto out_remove_device; } - + iommu_group_put(iommu_group); return group; out_remove_device: @@ -770,18 +771,11 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) if (!iommu_group) return ERR_PTR(-EINVAL); - /* a found vfio_group already holds a reference to the iommu_group */ group = vfio_group_get_from_iommu(iommu_group); - if (group) - goto out_put; + if (!group) + group = vfio_create_group(iommu_group, VFIO_IOMMU); - /* a newly created vfio_group keeps the reference. */ - group = vfio_create_group(iommu_group, VFIO_IOMMU); - if (IS_ERR(group)) - goto out_put; - return group; - -out_put: + /* The vfio_group holds a reference to the iommu_group */ iommu_group_put(iommu_group); return group; } From 2b678aa2f0990a25e15cdef66256a131566ecd2e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 15 Oct 2021 08:40:53 -0300 Subject: [PATCH 22/27] vfio: Use a refcount_t instead of a kref in the vfio_group The next patch adds a struct device to the struct vfio_group, and it is confusing/bad practice to have two krefs in the same struct. This kref is controlling the period when the vfio_group is registered in sysfs, and visible in the internal lookup. Switch it to a refcount_t instead. The refcount_dec_and_mutex_lock() is still required because we need atomicity of the list searches and sysfs presence. Reviewed-by: Liu Yi L Reviewed-by: Kevin Tian Reviewed-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v3-2fdfe4ca2cc6+18c-vfio_group_cdev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 4abb2e5e1965..e313fa030b91 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -69,7 +69,7 @@ struct vfio_unbound_dev { }; struct vfio_group { - struct kref kref; + refcount_t users; int minor; atomic_t container_users; struct iommu_group *iommu_group; @@ -377,7 +377,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, if (!group) return ERR_PTR(-ENOMEM); - kref_init(&group->kref); + refcount_set(&group->users, 1); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); INIT_LIST_HEAD(&group->unbound_list); @@ -433,10 +433,10 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, return group; } -/* called with vfio.group_lock held */ -static void vfio_group_release(struct kref *kref) +static void vfio_group_put(struct vfio_group *group) { - struct vfio_group *group = container_of(kref, struct vfio_group, kref); + if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock)) + return; /* * These data structures all have paired operations that can only be @@ -454,15 +454,9 @@ static void vfio_group_release(struct kref *kref) vfio_group_unlock_and_free(group); } -static void vfio_group_put(struct vfio_group *group) -{ - kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock); -} - -/* Assume group_lock or group reference is held */ static void vfio_group_get(struct vfio_group *group) { - kref_get(&group->kref); + refcount_inc(&group->users); } static struct vfio_group *vfio_group_get_from_minor(int minor) @@ -1657,6 +1651,9 @@ struct vfio_group *vfio_group_get_external_user(struct file *filep) if (ret) return ERR_PTR(ret); + /* + * Since the caller holds the fget on the file group->users must be >= 1 + */ vfio_group_get(group); return group; From 9cef73918e15d2284e71022291a8a07901e80bad Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 15 Oct 2021 08:40:54 -0300 Subject: [PATCH 23/27] vfio: Use cdev_device_add() instead of device_create() Modernize how vfio is creating the group char dev and sysfs presence. These days drivers with state should use cdev_device_add() and cdev_device_del() to manage the cdev and sysfs lifetime. This API requires the driver to put the struct device and struct cdev inside its state struct (vfio_group), and then use the usual device_initialize()/cdev_device_add()/cdev_device_del() sequence. Split the code to make this possible: - vfio_group_alloc()/vfio_group_release() are pair'd functions to alloc/free the vfio_group. release is done under the struct device kref. - vfio_create_group()/vfio_group_put() are pairs that manage the sysfs/cdev lifetime. Once the uses count is zero the vfio group's userspace presence is destroyed. - The IDR is replaced with an IDA. container_of(inode->i_cdev) is used to get back to the vfio_group during fops open. The IDA assigns unique minor numbers. Reviewed-by: Christoph Hellwig Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v3-2fdfe4ca2cc6+18c-vfio_group_cdev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 193 +++++++++++++++++++++----------------------- 1 file changed, 92 insertions(+), 101 deletions(-) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index e313fa030b91..82fb75464f92 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -43,9 +43,8 @@ static struct vfio { struct list_head iommu_drivers_list; struct mutex iommu_drivers_lock; struct list_head group_list; - struct idr group_idr; - struct mutex group_lock; - struct cdev group_cdev; + struct mutex group_lock; /* locks group_list */ + struct ida group_ida; dev_t group_devt; } vfio; @@ -69,14 +68,14 @@ struct vfio_unbound_dev { }; struct vfio_group { + struct device dev; + struct cdev cdev; refcount_t users; - int minor; atomic_t container_users; struct iommu_group *iommu_group; struct vfio_container *container; struct list_head device_list; struct mutex device_lock; - struct device *dev; struct notifier_block nb; struct list_head vfio_next; struct list_head container_next; @@ -98,6 +97,7 @@ MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. Thi #endif static DEFINE_XARRAY(vfio_device_set_xa); +static const struct file_operations vfio_group_fops; int vfio_assign_device_set(struct vfio_device *device, void *set_id) { @@ -281,19 +281,6 @@ void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops) } EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver); -/** - * Group minor allocation/free - both called with vfio.group_lock held - */ -static int vfio_alloc_group_minor(struct vfio_group *group) -{ - return idr_alloc(&vfio.group_idr, group, 0, MINORMASK + 1, GFP_KERNEL); -} - -static void vfio_free_group_minor(int minor) -{ - idr_remove(&vfio.group_idr, minor); -} - static int vfio_iommu_group_notifier(struct notifier_block *nb, unsigned long action, void *data); static void vfio_group_get(struct vfio_group *group); @@ -322,22 +309,6 @@ static void vfio_container_put(struct vfio_container *container) kref_put(&container->kref, vfio_container_release); } -static void vfio_group_unlock_and_free(struct vfio_group *group) -{ - struct vfio_unbound_dev *unbound, *tmp; - - mutex_unlock(&vfio.group_lock); - iommu_group_unregister_notifier(group->iommu_group, &group->nb); - - list_for_each_entry_safe(unbound, tmp, - &group->unbound_list, unbound_next) { - list_del(&unbound->unbound_next); - kfree(unbound); - } - iommu_group_put(group->iommu_group); - kfree(group); -} - /** * Group objects - create, release, get, put, search */ @@ -366,71 +337,112 @@ vfio_group_get_from_iommu(struct iommu_group *iommu_group) return group; } -static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, - enum vfio_group_type type) +static void vfio_group_release(struct device *dev) { - struct vfio_group *group, *existing_group; - struct device *dev; - int ret, minor; + struct vfio_group *group = container_of(dev, struct vfio_group, dev); + struct vfio_unbound_dev *unbound, *tmp; + + list_for_each_entry_safe(unbound, tmp, + &group->unbound_list, unbound_next) { + list_del(&unbound->unbound_next); + kfree(unbound); + } + + mutex_destroy(&group->device_lock); + mutex_destroy(&group->unbound_lock); + iommu_group_put(group->iommu_group); + ida_free(&vfio.group_ida, MINOR(group->dev.devt)); + kfree(group); +} + +static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, + enum vfio_group_type type) +{ + struct vfio_group *group; + int minor; group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) return ERR_PTR(-ENOMEM); + minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); + if (minor < 0) { + kfree(group); + return ERR_PTR(minor); + } + + device_initialize(&group->dev); + group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); + group->dev.class = vfio.class; + group->dev.release = vfio_group_release; + cdev_init(&group->cdev, &vfio_group_fops); + group->cdev.owner = THIS_MODULE; + refcount_set(&group->users, 1); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); INIT_LIST_HEAD(&group->unbound_list); mutex_init(&group->unbound_lock); - atomic_set(&group->container_users, 0); - atomic_set(&group->opened, 0); init_waitqueue_head(&group->container_q); group->iommu_group = iommu_group; - /* put in vfio_group_unlock_and_free() */ + /* put in vfio_group_release() */ iommu_group_ref_get(iommu_group); group->type = type; BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); + return group; +} + +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, + enum vfio_group_type type) +{ + struct vfio_group *group; + struct vfio_group *ret; + int err; + + group = vfio_group_alloc(iommu_group, type); + if (IS_ERR(group)) + return group; + + err = dev_set_name(&group->dev, "%s%d", + group->type == VFIO_NO_IOMMU ? "noiommu-" : "", + iommu_group_id(iommu_group)); + if (err) { + ret = ERR_PTR(err); + goto err_put; + } + group->nb.notifier_call = vfio_iommu_group_notifier; - ret = iommu_group_register_notifier(iommu_group, &group->nb); - if (ret) { - iommu_group_put(iommu_group); - kfree(group); - return ERR_PTR(ret); + err = iommu_group_register_notifier(iommu_group, &group->nb); + if (err) { + ret = ERR_PTR(err); + goto err_put; } mutex_lock(&vfio.group_lock); /* Did we race creating this group? */ - existing_group = __vfio_group_get_from_iommu(iommu_group); - if (existing_group) { - vfio_group_unlock_and_free(group); - return existing_group; - } + ret = __vfio_group_get_from_iommu(iommu_group); + if (ret) + goto err_unlock; - minor = vfio_alloc_group_minor(group); - if (minor < 0) { - vfio_group_unlock_and_free(group); - return ERR_PTR(minor); + err = cdev_device_add(&group->cdev, &group->dev); + if (err) { + ret = ERR_PTR(err); + goto err_unlock; } - dev = device_create(vfio.class, NULL, - MKDEV(MAJOR(vfio.group_devt), minor), group, "%s%d", - group->type == VFIO_NO_IOMMU ? "noiommu-" : "", - iommu_group_id(iommu_group)); - if (IS_ERR(dev)) { - vfio_free_group_minor(minor); - vfio_group_unlock_and_free(group); - return ERR_CAST(dev); - } - - group->minor = minor; - group->dev = dev; - list_add(&group->vfio_next, &vfio.group_list); mutex_unlock(&vfio.group_lock); return group; + +err_unlock: + mutex_unlock(&vfio.group_lock); + iommu_group_unregister_notifier(group->iommu_group, &group->nb); +err_put: + put_device(&group->dev); + return ret; } static void vfio_group_put(struct vfio_group *group) @@ -448,10 +460,12 @@ static void vfio_group_put(struct vfio_group *group) WARN_ON(atomic_read(&group->container_users)); WARN_ON(group->notifier.head); - device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor)); list_del(&group->vfio_next); - vfio_free_group_minor(group->minor); - vfio_group_unlock_and_free(group); + cdev_device_del(&group->cdev, &group->dev); + mutex_unlock(&vfio.group_lock); + + iommu_group_unregister_notifier(group->iommu_group, &group->nb); + put_device(&group->dev); } static void vfio_group_get(struct vfio_group *group) @@ -459,22 +473,6 @@ static void vfio_group_get(struct vfio_group *group) refcount_inc(&group->users); } -static struct vfio_group *vfio_group_get_from_minor(int minor) -{ - struct vfio_group *group; - - mutex_lock(&vfio.group_lock); - group = idr_find(&vfio.group_idr, minor); - if (!group) { - mutex_unlock(&vfio.group_lock); - return NULL; - } - vfio_group_get(group); - mutex_unlock(&vfio.group_lock); - - return group; -} - static struct vfio_group *vfio_group_get_from_dev(struct device *dev) { struct iommu_group *iommu_group; @@ -1479,11 +1477,12 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, static int vfio_group_fops_open(struct inode *inode, struct file *filep) { - struct vfio_group *group; + struct vfio_group *group = + container_of(inode->i_cdev, struct vfio_group, cdev); int opened; - group = vfio_group_get_from_minor(iminor(inode)); - if (!group) + /* users can be zero if this races with vfio_group_put() */ + if (!refcount_inc_not_zero(&group->users)) return -ENODEV; if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { @@ -2293,7 +2292,7 @@ static int __init vfio_init(void) { int ret; - idr_init(&vfio.group_idr); + ida_init(&vfio.group_ida); mutex_init(&vfio.group_lock); mutex_init(&vfio.iommu_drivers_lock); INIT_LIST_HEAD(&vfio.group_list); @@ -2318,11 +2317,6 @@ static int __init vfio_init(void) if (ret) goto err_alloc_chrdev; - cdev_init(&vfio.group_cdev, &vfio_group_fops); - ret = cdev_add(&vfio.group_cdev, vfio.group_devt, MINORMASK + 1); - if (ret) - goto err_cdev_add; - pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); #ifdef CONFIG_VFIO_NOIOMMU @@ -2330,8 +2324,6 @@ static int __init vfio_init(void) #endif return 0; -err_cdev_add: - unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); err_alloc_chrdev: class_destroy(vfio.class); vfio.class = NULL; @@ -2347,8 +2339,7 @@ static void __exit vfio_cleanup(void) #ifdef CONFIG_VFIO_NOIOMMU vfio_unregister_iommu_driver(&vfio_noiommu_ops); #endif - idr_destroy(&vfio.group_idr); - cdev_del(&vfio.group_cdev); + ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); class_destroy(vfio.class); vfio.class = NULL; From d0a9329d460cbc2f8150da520b1b75e397bbef9f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 26 Oct 2021 14:57:30 -0300 Subject: [PATCH 24/27] vfio/ccw: Remove unneeded GFP_DMA Since the ccw_io_region was split out of the private the allocation no longer needs the GFP_DMA. Remove it. Reported-by: Christoph Hellwig Fixes: c98e16b2fa12 ("s390/cio: Convert ccw_io_region to pointer") Reviewed-by: Matthew Rosato Reviewed-by: Eric Farman Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v4-cea4f5bd2c00+b52-ccw_mdev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 76099bcb765b..371558ec9204 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -161,7 +161,7 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) return -ENODEV; } - private = kzalloc(sizeof(*private), GFP_KERNEL | GFP_DMA); + private = kzalloc(sizeof(*private), GFP_KERNEL); if (!private) return -ENOMEM; From 0972c7dddf716a781dbe9abf4d042264b679ab53 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 26 Oct 2021 14:57:31 -0300 Subject: [PATCH 25/27] vfio/ccw: Use functions for alloc/free of the vfio_ccw_private Makes the code easier to understand what is memory lifecycle and what is other stuff. Reviewed-by: Eric Farman Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v4-cea4f5bd2c00+b52-ccw_mdev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 137 ++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 59 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index 371558ec9204..e32678a71644 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -137,16 +137,80 @@ static void vfio_ccw_sch_irq(struct subchannel *sch) vfio_ccw_fsm_event(private, VFIO_CCW_EVENT_INTERRUPT); } -static void vfio_ccw_free_regions(struct vfio_ccw_private *private) +static struct vfio_ccw_private *vfio_ccw_alloc_private(struct subchannel *sch) { - if (private->crw_region) - kmem_cache_free(vfio_ccw_crw_region, private->crw_region); - if (private->schib_region) - kmem_cache_free(vfio_ccw_schib_region, private->schib_region); - if (private->cmd_region) - kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); - if (private->io_region) - kmem_cache_free(vfio_ccw_io_region, private->io_region); + struct vfio_ccw_private *private; + + private = kzalloc(sizeof(*private), GFP_KERNEL); + if (!private) + return ERR_PTR(-ENOMEM); + + private->sch = sch; + mutex_init(&private->io_mutex); + private->state = VFIO_CCW_STATE_NOT_OPER; + INIT_LIST_HEAD(&private->crw); + INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); + INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); + atomic_set(&private->avail, 1); + + private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), + GFP_KERNEL); + if (!private->cp.guest_cp) + goto out_free_private; + + private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, + GFP_KERNEL | GFP_DMA); + if (!private->io_region) + goto out_free_cp; + + private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region, + GFP_KERNEL | GFP_DMA); + if (!private->cmd_region) + goto out_free_io; + + private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, + GFP_KERNEL | GFP_DMA); + + if (!private->schib_region) + goto out_free_cmd; + + private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, + GFP_KERNEL | GFP_DMA); + + if (!private->crw_region) + goto out_free_schib; + return private; + +out_free_schib: + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); +out_free_cmd: + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); +out_free_io: + kmem_cache_free(vfio_ccw_io_region, private->io_region); +out_free_cp: + kfree(private->cp.guest_cp); +out_free_private: + mutex_destroy(&private->io_mutex); + kfree(private); + return ERR_PTR(-ENOMEM); +} + +static void vfio_ccw_free_private(struct vfio_ccw_private *private) +{ + struct vfio_ccw_crw *crw, *temp; + + list_for_each_entry_safe(crw, temp, &private->crw, next) { + list_del(&crw->next); + kfree(crw); + } + + kmem_cache_free(vfio_ccw_crw_region, private->crw_region); + kmem_cache_free(vfio_ccw_schib_region, private->schib_region); + kmem_cache_free(vfio_ccw_cmd_region, private->cmd_region); + kmem_cache_free(vfio_ccw_io_region, private->io_region); + kfree(private->cp.guest_cp); + mutex_destroy(&private->io_mutex); + kfree(private); } static int vfio_ccw_sch_probe(struct subchannel *sch) @@ -161,53 +225,19 @@ static int vfio_ccw_sch_probe(struct subchannel *sch) return -ENODEV; } - private = kzalloc(sizeof(*private), GFP_KERNEL); - if (!private) - return -ENOMEM; + private = vfio_ccw_alloc_private(sch); + if (IS_ERR(private)) + return PTR_ERR(private); - private->cp.guest_cp = kcalloc(CCWCHAIN_LEN_MAX, sizeof(struct ccw1), - GFP_KERNEL); - if (!private->cp.guest_cp) - goto out_free; - - private->io_region = kmem_cache_zalloc(vfio_ccw_io_region, - GFP_KERNEL | GFP_DMA); - if (!private->io_region) - goto out_free; - - private->cmd_region = kmem_cache_zalloc(vfio_ccw_cmd_region, - GFP_KERNEL | GFP_DMA); - if (!private->cmd_region) - goto out_free; - - private->schib_region = kmem_cache_zalloc(vfio_ccw_schib_region, - GFP_KERNEL | GFP_DMA); - - if (!private->schib_region) - goto out_free; - - private->crw_region = kmem_cache_zalloc(vfio_ccw_crw_region, - GFP_KERNEL | GFP_DMA); - - if (!private->crw_region) - goto out_free; - - private->sch = sch; dev_set_drvdata(&sch->dev, private); - mutex_init(&private->io_mutex); spin_lock_irq(sch->lock); - private->state = VFIO_CCW_STATE_NOT_OPER; sch->isc = VFIO_CCW_ISC; ret = cio_enable_subchannel(sch, (u32)(unsigned long)sch); spin_unlock_irq(sch->lock); if (ret) goto out_free; - INIT_LIST_HEAD(&private->crw); - INIT_WORK(&private->io_work, vfio_ccw_sch_io_todo); - INIT_WORK(&private->crw_work, vfio_ccw_crw_todo); - atomic_set(&private->avail, 1); private->state = VFIO_CCW_STATE_STANDBY; ret = vfio_ccw_mdev_reg(sch); @@ -228,31 +258,20 @@ out_disable: cio_disable_subchannel(sch); out_free: dev_set_drvdata(&sch->dev, NULL); - vfio_ccw_free_regions(private); - kfree(private->cp.guest_cp); - kfree(private); + vfio_ccw_free_private(private); return ret; } static void vfio_ccw_sch_remove(struct subchannel *sch) { struct vfio_ccw_private *private = dev_get_drvdata(&sch->dev); - struct vfio_ccw_crw *crw, *temp; vfio_ccw_sch_quiesce(sch); - - list_for_each_entry_safe(crw, temp, &private->crw, next) { - list_del(&crw->next); - kfree(crw); - } - vfio_ccw_mdev_unreg(sch); dev_set_drvdata(&sch->dev, NULL); - vfio_ccw_free_regions(private); - kfree(private->cp.guest_cp); - kfree(private); + vfio_ccw_free_private(private); VFIO_CCW_MSG_EVENT(4, "unbound from subchannel %x.%x.%04x\n", sch->schid.cssid, sch->schid.ssid, From 39b6ee011f341526308577847f1002be5d1e0a6e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 26 Oct 2021 14:57:32 -0300 Subject: [PATCH 26/27] vfio/ccw: Pass vfio_ccw_private not mdev_device to various functions mdev_device should only be used in functions assigned to ops callbacks, interior functions should use the struct vfio_ccw_private instead of repeatedly trying to get it from the mdev. Reviewed-by: Christoph Hellwig Reviewed-by: Cornelia Huck Reviewed-by: Eric Farman Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-cea4f5bd2c00+b52-ccw_mdev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_ops.c | 37 +++++++++++++-------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 7f540ad0b568..1edbea9de0ec 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -17,13 +17,11 @@ #include "vfio_ccw_private.h" -static int vfio_ccw_mdev_reset(struct mdev_device *mdev) +static int vfio_ccw_mdev_reset(struct vfio_ccw_private *private) { - struct vfio_ccw_private *private; struct subchannel *sch; int ret; - private = dev_get_drvdata(mdev_parent_dev(mdev)); sch = private->sch; /* * TODO: @@ -61,7 +59,7 @@ static int vfio_ccw_mdev_notifier(struct notifier_block *nb, if (!cp_iova_pinned(&private->cp, unmap->iova)) return NOTIFY_OK; - if (vfio_ccw_mdev_reset(private->mdev)) + if (vfio_ccw_mdev_reset(private)) return NOTIFY_BAD; cp_free(&private->cp); @@ -201,7 +199,7 @@ static void vfio_ccw_mdev_close_device(struct mdev_device *mdev) if ((private->state != VFIO_CCW_STATE_NOT_OPER) && (private->state != VFIO_CCW_STATE_STANDBY)) { - if (!vfio_ccw_mdev_reset(mdev)) + if (!vfio_ccw_mdev_reset(private)) private->state = VFIO_CCW_STATE_STANDBY; /* The state will be NOT_OPER on error. */ } @@ -311,12 +309,9 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev, return -EINVAL; } -static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info, - struct mdev_device *mdev) +static int vfio_ccw_mdev_get_device_info(struct vfio_ccw_private *private, + struct vfio_device_info *info) { - struct vfio_ccw_private *private; - - private = dev_get_drvdata(mdev_parent_dev(mdev)); info->flags = VFIO_DEVICE_FLAGS_CCW | VFIO_DEVICE_FLAGS_RESET; info->num_regions = VFIO_CCW_NUM_REGIONS + private->num_regions; info->num_irqs = VFIO_CCW_NUM_IRQS; @@ -324,14 +319,12 @@ static int vfio_ccw_mdev_get_device_info(struct vfio_device_info *info, return 0; } -static int vfio_ccw_mdev_get_region_info(struct vfio_region_info *info, - struct mdev_device *mdev, +static int vfio_ccw_mdev_get_region_info(struct vfio_ccw_private *private, + struct vfio_region_info *info, unsigned long arg) { - struct vfio_ccw_private *private; int i; - private = dev_get_drvdata(mdev_parent_dev(mdev)); switch (info->index) { case VFIO_CCW_CONFIG_REGION_INDEX: info->offset = 0; @@ -406,19 +399,16 @@ static int vfio_ccw_mdev_get_irq_info(struct vfio_irq_info *info) return 0; } -static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev, +static int vfio_ccw_mdev_set_irqs(struct vfio_ccw_private *private, uint32_t flags, uint32_t index, void __user *data) { - struct vfio_ccw_private *private; struct eventfd_ctx **ctx; if (!(flags & VFIO_IRQ_SET_ACTION_TRIGGER)) return -EINVAL; - private = dev_get_drvdata(mdev_parent_dev(mdev)); - switch (index) { case VFIO_CCW_IO_IRQ_INDEX: ctx = &private->io_trigger; @@ -524,6 +514,8 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, unsigned int cmd, unsigned long arg) { + struct vfio_ccw_private *private = + dev_get_drvdata(mdev_parent_dev(mdev)); int ret = 0; unsigned long minsz; @@ -540,7 +532,7 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, if (info.argsz < minsz) return -EINVAL; - ret = vfio_ccw_mdev_get_device_info(&info, mdev); + ret = vfio_ccw_mdev_get_device_info(private, &info); if (ret) return ret; @@ -558,7 +550,7 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, if (info.argsz < minsz) return -EINVAL; - ret = vfio_ccw_mdev_get_region_info(&info, mdev, arg); + ret = vfio_ccw_mdev_get_region_info(private, &info, arg); if (ret) return ret; @@ -603,10 +595,11 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, return ret; data = (void __user *)(arg + minsz); - return vfio_ccw_mdev_set_irqs(mdev, hdr.flags, hdr.index, data); + return vfio_ccw_mdev_set_irqs(private, hdr.flags, hdr.index, + data); } case VFIO_DEVICE_RESET: - return vfio_ccw_mdev_reset(mdev); + return vfio_ccw_mdev_reset(private); default: return -ENOTTY; } From 3bf1311f351ef289f2aee79b86bcece2039fa611 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 26 Oct 2021 14:57:33 -0300 Subject: [PATCH 27/27] vfio/ccw: Convert to use vfio_register_emulated_iommu_dev() This is a more complicated conversion because vfio_ccw is sharing the vfio_device between both the mdev_device, its vfio_device and the css_driver. The mdev is a singleton, and the reason for this sharing is so the extra css_driver function callbacks to be delivered to the vfio_device implementation. This keeps things as they are, with the css_driver allocating the singleton, not the mdev_driver. Embed the vfio_device in the vfio_ccw_private and instantiate it as a vfio_device when the mdev probes. The drvdata of both the css_device and the mdev_device point at the private, and container_of is used to get it back from the vfio_device. Reviewed-by: Eric Farman Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v4-cea4f5bd2c00+b52-ccw_mdev_jgg@nvidia.com Signed-off-by: Alex Williamson --- drivers/s390/cio/vfio_ccw_drv.c | 21 ++++-- drivers/s390/cio/vfio_ccw_ops.c | 107 +++++++++++++++++----------- drivers/s390/cio/vfio_ccw_private.h | 5 ++ 3 files changed, 85 insertions(+), 48 deletions(-) diff --git a/drivers/s390/cio/vfio_ccw_drv.c b/drivers/s390/cio/vfio_ccw_drv.c index e32678a71644..040742777095 100644 --- a/drivers/s390/cio/vfio_ccw_drv.c +++ b/drivers/s390/cio/vfio_ccw_drv.c @@ -468,7 +468,7 @@ static int __init vfio_ccw_sch_init(void) vfio_ccw_work_q = create_singlethread_workqueue("vfio-ccw"); if (!vfio_ccw_work_q) { ret = -ENOMEM; - goto out_err; + goto out_regions; } vfio_ccw_io_region = kmem_cache_create_usercopy("vfio_ccw_io_region", @@ -477,7 +477,7 @@ static int __init vfio_ccw_sch_init(void) sizeof(struct ccw_io_region), NULL); if (!vfio_ccw_io_region) { ret = -ENOMEM; - goto out_err; + goto out_regions; } vfio_ccw_cmd_region = kmem_cache_create_usercopy("vfio_ccw_cmd_region", @@ -486,7 +486,7 @@ static int __init vfio_ccw_sch_init(void) sizeof(struct ccw_cmd_region), NULL); if (!vfio_ccw_cmd_region) { ret = -ENOMEM; - goto out_err; + goto out_regions; } vfio_ccw_schib_region = kmem_cache_create_usercopy("vfio_ccw_schib_region", @@ -496,7 +496,7 @@ static int __init vfio_ccw_sch_init(void) if (!vfio_ccw_schib_region) { ret = -ENOMEM; - goto out_err; + goto out_regions; } vfio_ccw_crw_region = kmem_cache_create_usercopy("vfio_ccw_crw_region", @@ -506,19 +506,25 @@ static int __init vfio_ccw_sch_init(void) if (!vfio_ccw_crw_region) { ret = -ENOMEM; - goto out_err; + goto out_regions; } + ret = mdev_register_driver(&vfio_ccw_mdev_driver); + if (ret) + goto out_regions; + isc_register(VFIO_CCW_ISC); ret = css_driver_register(&vfio_ccw_sch_driver); if (ret) { isc_unregister(VFIO_CCW_ISC); - goto out_err; + goto out_driver; } return ret; -out_err: +out_driver: + mdev_unregister_driver(&vfio_ccw_mdev_driver); +out_regions: vfio_ccw_destroy_regions(); destroy_workqueue(vfio_ccw_work_q); vfio_ccw_debug_exit(); @@ -528,6 +534,7 @@ out_err: static void __exit vfio_ccw_sch_exit(void) { css_driver_unregister(&vfio_ccw_sch_driver); + mdev_unregister_driver(&vfio_ccw_mdev_driver); isc_unregister(VFIO_CCW_ISC); vfio_ccw_destroy_regions(); destroy_workqueue(vfio_ccw_work_q); diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 1edbea9de0ec..d8589afac272 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -17,6 +17,8 @@ #include "vfio_ccw_private.h" +static const struct vfio_device_ops vfio_ccw_dev_ops; + static int vfio_ccw_mdev_reset(struct vfio_ccw_private *private) { struct subchannel *sch; @@ -111,10 +113,10 @@ static struct attribute_group *mdev_type_groups[] = { NULL, }; -static int vfio_ccw_mdev_create(struct mdev_device *mdev) +static int vfio_ccw_mdev_probe(struct mdev_device *mdev) { - struct vfio_ccw_private *private = - dev_get_drvdata(mdev_parent_dev(mdev)); + struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); + int ret; if (private->state == VFIO_CCW_STATE_NOT_OPER) return -ENODEV; @@ -122,6 +124,10 @@ static int vfio_ccw_mdev_create(struct mdev_device *mdev) if (atomic_dec_if_positive(&private->avail) < 0) return -EPERM; + memset(&private->vdev, 0, sizeof(private->vdev)); + vfio_init_group_dev(&private->vdev, &mdev->dev, + &vfio_ccw_dev_ops); + private->mdev = mdev; private->state = VFIO_CCW_STATE_IDLE; @@ -130,19 +136,31 @@ static int vfio_ccw_mdev_create(struct mdev_device *mdev) private->sch->schid.ssid, private->sch->schid.sch_no); + ret = vfio_register_emulated_iommu_dev(&private->vdev); + if (ret) + goto err_atomic; + dev_set_drvdata(&mdev->dev, private); return 0; + +err_atomic: + vfio_uninit_group_dev(&private->vdev); + atomic_inc(&private->avail); + private->mdev = NULL; + private->state = VFIO_CCW_STATE_IDLE; + return ret; } -static int vfio_ccw_mdev_remove(struct mdev_device *mdev) +static void vfio_ccw_mdev_remove(struct mdev_device *mdev) { - struct vfio_ccw_private *private = - dev_get_drvdata(mdev_parent_dev(mdev)); + struct vfio_ccw_private *private = dev_get_drvdata(mdev->dev.parent); VFIO_CCW_MSG_EVENT(2, "mdev %pUl, sch %x.%x.%04x: remove\n", mdev_uuid(mdev), private->sch->schid.cssid, private->sch->schid.ssid, private->sch->schid.sch_no); + vfio_unregister_group_dev(&private->vdev); + if ((private->state != VFIO_CCW_STATE_NOT_OPER) && (private->state != VFIO_CCW_STATE_STANDBY)) { if (!vfio_ccw_sch_quiesce(private->sch)) @@ -150,23 +168,22 @@ static int vfio_ccw_mdev_remove(struct mdev_device *mdev) /* The state will be NOT_OPER on error. */ } + vfio_uninit_group_dev(&private->vdev); cp_free(&private->cp); private->mdev = NULL; atomic_inc(&private->avail); - - return 0; } -static int vfio_ccw_mdev_open_device(struct mdev_device *mdev) +static int vfio_ccw_mdev_open_device(struct vfio_device *vdev) { struct vfio_ccw_private *private = - dev_get_drvdata(mdev_parent_dev(mdev)); + container_of(vdev, struct vfio_ccw_private, vdev); unsigned long events = VFIO_IOMMU_NOTIFY_DMA_UNMAP; int ret; private->nb.notifier_call = vfio_ccw_mdev_notifier; - ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + ret = vfio_register_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, &events, &private->nb); if (ret) return ret; @@ -187,15 +204,15 @@ static int vfio_ccw_mdev_open_device(struct mdev_device *mdev) out_unregister: vfio_ccw_unregister_dev_regions(private); - vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, + vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, &private->nb); return ret; } -static void vfio_ccw_mdev_close_device(struct mdev_device *mdev) +static void vfio_ccw_mdev_close_device(struct vfio_device *vdev) { struct vfio_ccw_private *private = - dev_get_drvdata(mdev_parent_dev(mdev)); + container_of(vdev, struct vfio_ccw_private, vdev); if ((private->state != VFIO_CCW_STATE_NOT_OPER) && (private->state != VFIO_CCW_STATE_STANDBY)) { @@ -206,8 +223,7 @@ static void vfio_ccw_mdev_close_device(struct mdev_device *mdev) cp_free(&private->cp); vfio_ccw_unregister_dev_regions(private); - vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, - &private->nb); + vfio_unregister_notifier(vdev->dev, VFIO_IOMMU_NOTIFY, &private->nb); } static ssize_t vfio_ccw_mdev_read_io_region(struct vfio_ccw_private *private, @@ -231,15 +247,14 @@ static ssize_t vfio_ccw_mdev_read_io_region(struct vfio_ccw_private *private, return ret; } -static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev, +static ssize_t vfio_ccw_mdev_read(struct vfio_device *vdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); unsigned int index = VFIO_CCW_OFFSET_TO_INDEX(*ppos); - struct vfio_ccw_private *private; - - private = dev_get_drvdata(mdev_parent_dev(mdev)); if (index >= VFIO_CCW_NUM_REGIONS + private->num_regions) return -EINVAL; @@ -284,15 +299,14 @@ out_unlock: return ret; } -static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev, +static ssize_t vfio_ccw_mdev_write(struct vfio_device *vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); unsigned int index = VFIO_CCW_OFFSET_TO_INDEX(*ppos); - struct vfio_ccw_private *private; - - private = dev_get_drvdata(mdev_parent_dev(mdev)); if (index >= VFIO_CCW_NUM_REGIONS + private->num_regions) return -EINVAL; @@ -510,12 +524,12 @@ void vfio_ccw_unregister_dev_regions(struct vfio_ccw_private *private) private->region = NULL; } -static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, +static ssize_t vfio_ccw_mdev_ioctl(struct vfio_device *vdev, unsigned int cmd, unsigned long arg) { struct vfio_ccw_private *private = - dev_get_drvdata(mdev_parent_dev(mdev)); + container_of(vdev, struct vfio_ccw_private, vdev); int ret = 0; unsigned long minsz; @@ -606,37 +620,48 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev, } /* Request removal of the device*/ -static void vfio_ccw_mdev_request(struct mdev_device *mdev, unsigned int count) +static void vfio_ccw_mdev_request(struct vfio_device *vdev, unsigned int count) { - struct vfio_ccw_private *private = dev_get_drvdata(mdev_parent_dev(mdev)); - - if (!private) - return; + struct vfio_ccw_private *private = + container_of(vdev, struct vfio_ccw_private, vdev); + struct device *dev = vdev->dev; if (private->req_trigger) { if (!(count % 10)) - dev_notice_ratelimited(mdev_dev(private->mdev), + dev_notice_ratelimited(dev, "Relaying device request to user (#%u)\n", count); eventfd_signal(private->req_trigger, 1); } else if (count == 0) { - dev_notice(mdev_dev(private->mdev), + dev_notice(dev, "No device request channel registered, blocked until released by user\n"); } } +static const struct vfio_device_ops vfio_ccw_dev_ops = { + .open_device = vfio_ccw_mdev_open_device, + .close_device = vfio_ccw_mdev_close_device, + .read = vfio_ccw_mdev_read, + .write = vfio_ccw_mdev_write, + .ioctl = vfio_ccw_mdev_ioctl, + .request = vfio_ccw_mdev_request, +}; + +struct mdev_driver vfio_ccw_mdev_driver = { + .driver = { + .name = "vfio_ccw_mdev", + .owner = THIS_MODULE, + .mod_name = KBUILD_MODNAME, + }, + .probe = vfio_ccw_mdev_probe, + .remove = vfio_ccw_mdev_remove, +}; + static const struct mdev_parent_ops vfio_ccw_mdev_ops = { .owner = THIS_MODULE, + .device_driver = &vfio_ccw_mdev_driver, .supported_type_groups = mdev_type_groups, - .create = vfio_ccw_mdev_create, - .remove = vfio_ccw_mdev_remove, - .open_device = vfio_ccw_mdev_open_device, - .close_device = vfio_ccw_mdev_close_device, - .read = vfio_ccw_mdev_read, - .write = vfio_ccw_mdev_write, - .ioctl = vfio_ccw_mdev_ioctl, - .request = vfio_ccw_mdev_request, }; int vfio_ccw_mdev_reg(struct subchannel *sch) diff --git a/drivers/s390/cio/vfio_ccw_private.h b/drivers/s390/cio/vfio_ccw_private.h index b2c762eb42b9..7272eb788612 100644 --- a/drivers/s390/cio/vfio_ccw_private.h +++ b/drivers/s390/cio/vfio_ccw_private.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -67,6 +68,7 @@ struct vfio_ccw_crw { /** * struct vfio_ccw_private + * @vdev: Embedded VFIO device * @sch: pointer to the subchannel * @state: internal state of the device * @completion: synchronization helper of the I/O completion @@ -90,6 +92,7 @@ struct vfio_ccw_crw { * @crw_work: work for deferral process of CRW handling */ struct vfio_ccw_private { + struct vfio_device vdev; struct subchannel *sch; int state; struct completion *completion; @@ -121,6 +124,8 @@ extern void vfio_ccw_mdev_unreg(struct subchannel *sch); extern int vfio_ccw_sch_quiesce(struct subchannel *sch); +extern struct mdev_driver vfio_ccw_mdev_driver; + /* * States of the device statemachine. */