// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/aperture.h>
#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vgaarb.h>
#include <linux/nospec.h>
#include <linux/sched/mm.h>
#include <linux/iommufd.h>
#if IS_ENABLED(CONFIG_EEH)
#include <asm/eeh.h>
#endif

#include "vfio_pci_priv.h"

#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC "core driver for VFIO based PCI devices"

static bool nointxmask;
static bool disable_vga;
static bool disable_idle_d3;

/* List of PF's that vfio_pci_core_sriov_configure() has been called on */
static DEFINE_MUTEX(vfio_pci_sriov_pfs_mutex);
static LIST_HEAD(vfio_pci_sriov_pfs);
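
/*
 * Note: the module_param() wiring for nointxmask, disable_vga and
 * disable_idle_d3 lives in vfio_pci.c rather than here (see "vfio/pci:
 * Move module parameters to vfio_pci.c").  A minimal sketch of that
 * wiring, assuming the variable is made visible to that file:
 *
 *	module_param(nointxmask, bool, 0444);
 *	MODULE_PARM_DESC(nointxmask,
 *		"Disable support for PCI 2.3 style INTx masking");
 */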

struct vfio_pci_dummy_resource {
	struct resource		resource;
	int			index;
	struct list_head	res_next;
};

struct vfio_pci_vf_token {
	struct mutex		lock;
	uuid_t			uuid;
	int			users;
};

struct vfio_pci_mmap_vma {
	struct vm_area_struct	*vma;
	struct list_head	vma_next;
};

static inline bool vfio_vga_disabled(void)
{
#ifdef CONFIG_VFIO_PCI_VGA
	return disable_vga;
#else
	return true;
#endif
}

/*
 * Our VGA arbiter participation is limited since we don't know anything
 * about the device itself.  However, if the device is the only VGA device
 * downstream of a bridge and VFIO VGA support is disabled, then we can
 * safely return legacy VGA IO and memory as not decoded since the user
 * has no way to get to it and routing can be disabled externally at the
 * bridge.
 */
static unsigned int vfio_pci_set_decode(struct pci_dev *pdev, bool single_vga)
{
	struct pci_dev *tmp = NULL;
	unsigned char max_busnr;
	unsigned int decodes;

	if (single_vga || !vfio_vga_disabled() || pci_is_root_bus(pdev->bus))
		return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
		       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

	max_busnr = pci_bus_max_busnr(pdev->bus);
	decodes = VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;

	while ((tmp = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, tmp)) != NULL) {
		if (tmp == pdev ||
		    pci_domain_nr(tmp->bus) != pci_domain_nr(pdev->bus) ||
		    pci_is_root_bus(tmp->bus))
			continue;

		if (tmp->bus->number >= pdev->bus->number &&
		    tmp->bus->number <= max_busnr) {
			pci_dev_put(tmp);
			decodes |= VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;
			break;
		}
	}

	return decodes;
}
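
/*
 * Illustrative sketch (not the registration site itself): the decode
 * callback above is the kind of hook handed to the VGA arbiter when the
 * device is set up, roughly along the lines of:
 *
 *	ret = vga_client_register(vdev->pdev, vfio_pci_set_decode);
 *
 * assuming the current single-callback vga_client_register() prototype.
 * The arbiter then invokes the callback to learn which legacy VGA
 * resources this device actually decodes.
 */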

static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
{
	struct resource *res;
	int i;
	struct vfio_pci_dummy_resource *dummy_res;

	for (i = 0; i < PCI_STD_NUM_BARS; i++) {
		int bar = i + PCI_STD_RESOURCES;

		res = &vdev->pdev->resource[bar];

		if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
			goto no_mmap;

		if (!(res->flags & IORESOURCE_MEM))
			goto no_mmap;

		/*
		 * The PCI core shouldn't set up a resource with a
		 * type but zero size.  But there may be bugs that
		 * cause us to do that.
		 */
		if (!resource_size(res))
			goto no_mmap;

		if (resource_size(res) >= PAGE_SIZE) {
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}

		if (!(res->start & ~PAGE_MASK)) {
			/*
			 * Add a dummy resource to reserve the remainder
			 * of the exclusive page in case that hot-add
			 * device's bar is assigned into it.
			 */
			dummy_res =
				kzalloc(sizeof(*dummy_res), GFP_KERNEL_ACCOUNT);
			if (dummy_res == NULL)
				goto no_mmap;

			dummy_res->resource.name = "vfio sub-page reserved";
			dummy_res->resource.start = res->end + 1;
			dummy_res->resource.end = res->start + PAGE_SIZE - 1;
			dummy_res->resource.flags = res->flags;
			if (request_resource(res->parent,
					     &dummy_res->resource)) {
				kfree(dummy_res);
				goto no_mmap;
			}
			dummy_res->index = bar;
			list_add(&dummy_res->res_next,
				 &vdev->dummy_resources_list);
			vdev->bar_mmap_supported[bar] = true;
			continue;
		}
		/*
		 * Here we don't handle the case when the BAR is not page
		 * aligned because we can't expect the BAR will be
		 * assigned into the same location in a page in guest
		 * when we passthrough the BAR.  And it's hard to access
		 * this BAR in userspace because we have no way to get
		 * the BAR's location in a page.
		 */
no_mmap:
		vdev->bar_mmap_supported[bar] = false;
	}
}
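
/*
 * Worked example of the dummy resource math above, assuming 4K pages:
 * a 2K BAR page-aligned at 0xf0000000 gives res->start = 0xf0000000 and
 * res->end = 0xf00007ff, so the dummy resource claims 0xf0000800 through
 * 0xf0000fff (res->start + PAGE_SIZE - 1).  Nothing else can then be
 * assigned into the tail of that page, making the whole page safe to mmap.
 */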

struct vfio_pci_group_info;
static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set);
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
				      struct vfio_pci_group_info *groups,
				      struct iommufd_ctx *iommufd_ctx);

/*
 * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND
 * _and_ the ability to detect when the device is asserting INTx via PCI_STATUS.
 * If a device implements the former but not the latter we would typically
 * expect broken_intx_masking to be set and require an exclusive interrupt.
 * However, since we do have control of the device's ability to assert INTx,
 * we can instead pretend that the device does not implement INTx, virtualizing
 * the pin register to report zero and maintaining DisINTx set on the host.
 */
static bool vfio_pci_nointx(struct pci_dev *pdev)
{
	switch (pdev->vendor) {
	case PCI_VENDOR_ID_INTEL:
		switch (pdev->device) {
		/* All i40e (XL710/X710/XXV710) 10/20/25/40GbE NICs */
		case 0x1572:
		case 0x1574:
		case 0x1580 ... 0x1581:
		case 0x1583 ... 0x158b:
		case 0x37d0 ... 0x37d2:
		/* X550 */
		case 0x1563:
			return true;
		default:
			return false;
		}
	}

	return false;
}
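
/*
 * A minimal sketch of how a caller might consume this quirk during device
 * enable, assuming a vdev->nointx flag as used by this driver:
 *
 *	if (vfio_pci_nointx(pdev)) {
 *		pci_info(pdev, "Masking broken INTx support\n");
 *		vdev->nointx = true;
 *		pci_intx(pdev, 0);
 *	}
 *
 * i.e. DisINTx is forced on and the virtualized pin register reports no
 * INTx support to the user.
 */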

static void vfio_pci_probe_power_state(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	u16 pmcsr;

	if (!pdev->pm_cap)
		return;

	pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmcsr);

	vdev->needs_pm_restore = !(pmcsr & PCI_PM_CTRL_NO_SOFT_RESET);
}

/*
 * pci_set_power_state() wrapper handling devices which perform a soft reset on
 * D3->D0 transition.  Save state prior to D0/1/2->D3, stash it on the vdev,
 * restore when returned to D0.  Saved separately from pci_saved_state for use
 * by PM capability emulation and separately from pci_dev internal saved state
 * to avoid it being overwritten and consumed around other resets.
 */
int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t state)
{
	struct pci_dev *pdev = vdev->pdev;
	bool needs_restore = false, needs_save = false;
	int ret;

	/* Prevent changing power state for PFs with VFs enabled */
	if (pci_num_vf(pdev) && state > PCI_D0)
		return -EBUSY;

	if (vdev->needs_pm_restore) {
		if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) {
			pci_save_state(pdev);
			needs_save = true;
		}

		if (pdev->current_state >= PCI_D3hot && state <= PCI_D0)
			needs_restore = true;
	}

	ret = pci_set_power_state(pdev, state);

	if (!ret) {
		/* D3 might be unsupported via quirk, skip unless in D3 */
		if (needs_save && pdev->current_state >= PCI_D3hot) {
			/*
			 * The current PCI state will be saved locally in
			 * 'pm_save' during the D3hot transition.  When the
			 * device state is changed to D0 again with the current
			 * function, then pci_store_saved_state() will restore
			 * the state and will free the memory pointed to by
			 * 'pm_save'.  There are a few cases where the PCI power
			 * state can be changed to D0 without the involvement
			 * of the driver.  For these cases, free the earlier
			 * allocated memory first before overwriting 'pm_save'
			 * to prevent a memory leak.
			 */
			kfree(vdev->pm_save);
			vdev->pm_save = pci_store_saved_state(pdev);
		} else if (needs_restore) {
			pci_load_and_free_saved_state(pdev, &vdev->pm_save);
			pci_restore_state(pdev);
		}
	}

	return ret;
}
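
/*
 * Usage note: callers simply request the target D-state and let this
 * wrapper handle save/restore, e.g. vfio_pci_set_power_state(vdev,
 * PCI_D3hot) when idling a device and vfio_pci_set_power_state(vdev,
 * PCI_D0) before use; a PF with VFs enabled gets -EBUSY for anything
 * above D0 per the check above.
 */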

static int vfio_pci_runtime_pm_entry(struct vfio_pci_core_device *vdev,
				     struct eventfd_ctx *efdctx)
{
	/*
	 * The vdev power related flags are protected with 'memory_lock'
	 * semaphore.
	 */
	vfio_pci_zap_and_down_write_memory_lock(vdev);
	if (vdev->pm_runtime_engaged) {
		up_write(&vdev->memory_lock);
		return -EINVAL;
	}

	vdev->pm_runtime_engaged = true;
	vdev->pm_wake_eventfd_ctx = efdctx;
	pm_runtime_put_noidle(&vdev->pdev->dev);
	up_write(&vdev->memory_lock);

	return 0;
}
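
/*
 * The pm_runtime_put_noidle() above only drops the usage count that the
 * driver normally holds; it is balanced by the pm_runtime_get_noresume()
 * in __vfio_pci_runtime_pm_exit() below, so every low power entry is
 * matched by exactly one exit regardless of whether the device actually
 * reached runtime suspend in between.
 */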

static int vfio_pci_core_pm_entry(struct vfio_device *device, u32 flags,
				  void __user *arg, size_t argsz)
{
	struct vfio_pci_core_device *vdev =
		container_of(device, struct vfio_pci_core_device, vdev);
	int ret;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	/*
	 * Inside vfio_pci_runtime_pm_entry(), only the runtime PM usage count
	 * will be decremented.  The pm_runtime_put() will be invoked again
	 * while returning from the ioctl and then the device can go into the
	 * runtime suspended state.
	 */
	return vfio_pci_runtime_pm_entry(vdev, NULL);
}

static int vfio_pci_core_pm_entry_with_wakeup(
	struct vfio_device *device, u32 flags,
	struct vfio_device_low_power_entry_with_wakeup __user *arg,
	size_t argsz)
{
	struct vfio_pci_core_device *vdev =
		container_of(device, struct vfio_pci_core_device, vdev);
	struct vfio_device_low_power_entry_with_wakeup entry;
	struct eventfd_ctx *efdctx;
	int ret;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
				 sizeof(entry));
	if (ret != 1)
		return ret;

	if (copy_from_user(&entry, arg, sizeof(entry)))
		return -EFAULT;

	if (entry.wakeup_eventfd < 0)
		return -EINVAL;

	efdctx = eventfd_ctx_fdget(entry.wakeup_eventfd);
	if (IS_ERR(efdctx))
		return PTR_ERR(efdctx);

	ret = vfio_pci_runtime_pm_entry(vdev, efdctx);
	if (ret)
		eventfd_ctx_put(efdctx);

	return ret;
}
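
/*
 * A hedged userspace sketch of driving the feature above via the
 * VFIO_DEVICE_FEATURE ioctl, assuming the uapi layout from <linux/vfio.h>
 * (a struct vfio_device_feature header followed by the
 * struct vfio_device_low_power_entry_with_wakeup payload):
 *
 *	struct vfio_device_feature *feat;
 *	struct vfio_device_low_power_entry_with_wakeup *entry;
 *	char buf[sizeof(*feat) + sizeof(*entry)];
 *
 *	feat = (void *)buf;
 *	feat->argsz = sizeof(buf);
 *	feat->flags = VFIO_DEVICE_FEATURE_SET |
 *		      VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
 *	entry = (void *)feat->data;
 *	entry->wakeup_eventfd = efd;	(an fd from eventfd(0, 0))
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, feat);
 */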
|
|
|
|
|
|
|
|
static void __vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
|
|
|
|
{
|
|
|
|
if (vdev->pm_runtime_engaged) {
|
|
|
|
vdev->pm_runtime_engaged = false;
|
|
|
|
pm_runtime_get_noresume(&vdev->pdev->dev);
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP
This patch implements VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP
device feature. In the VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY, if there is
any access for the VFIO device on the host side, then the device will
be moved out of the low power state without the user's guest driver
involvement. Once the device access has been finished, then the host
can move the device again into low power state. With the low power
entry happened through VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP,
the device will not be moved back into the low power state and
a notification will be sent to the user by triggering wakeup eventfd.
vfio_pci_core_pm_entry() will be called for both the variants of low
power feature entry so add an extra argument for wakeup eventfd context
and store locally in 'struct vfio_pci_core_device'.
For the entry happened without wakeup eventfd, all the exit related
handling will be done by the LOW_POWER_EXIT device feature only.
When the LOW_POWER_EXIT will be called, then the vfio core layer
vfio_device_pm_runtime_get() will increment the usage count and will
resume the device. In the driver runtime_resume callback, the
'pm_wake_eventfd_ctx' will be NULL. Then vfio_pci_core_pm_exit()
will call vfio_pci_runtime_pm_exit() and all the exit related handling
will be done.
For the entry happened with wakeup eventfd, in the driver resume
callback, eventfd will be triggered and all the exit related handling will
be done. When vfio_pci_runtime_pm_exit() will be called by
vfio_pci_core_pm_exit(), then it will return early.
But if the runtime suspend has not happened on the host side, then
all the exit related handling will be done in vfio_pci_core_pm_exit()
only.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-6-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:50 +00:00
|
|
|
|
|
|
|
if (vdev->pm_wake_eventfd_ctx) {
|
|
|
|
eventfd_ctx_put(vdev->pm_wake_eventfd_ctx);
|
|
|
|
vdev->pm_wake_eventfd_ctx = NULL;
|
|
|
|
}
|
	}
}

static void vfio_pci_runtime_pm_exit(struct vfio_pci_core_device *vdev)
{
	/*
	 * The vdev power related flags are protected with 'memory_lock'
	 * semaphore.
	 */
	down_write(&vdev->memory_lock);
	__vfio_pci_runtime_pm_exit(vdev);
	up_write(&vdev->memory_lock);
}

static int vfio_pci_core_pm_exit(struct vfio_device *device, u32 flags,
				 void __user *arg, size_t argsz)
{
	struct vfio_pci_core_device *vdev =
		container_of(device, struct vfio_pci_core_device, vdev);
	int ret;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	/*
	 * The device is always in the active state here due to pm wrappers
	 * around ioctls. If the device had entered a low power state and
	 * pm_wake_eventfd_ctx is valid, vfio_pci_core_runtime_resume() has
	 * already signaled the eventfd and exited low power mode itself.
	 * pm_runtime_engaged protects the redundant call here.
	 */
	vfio_pci_runtime_pm_exit(vdev);
	return 0;
}
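
For orientation, here is a rough userspace sketch of driving these two
features through the VFIO_DEVICE_FEATURE ioctl. The uapi names come from
<linux/vfio.h>; the helper name and fd plumbing are illustrative assumptions,
not code from this file:

	#include <string.h>
	#include <sys/eventfd.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Hypothetical helper: request low power entry with wakeup eventfd. */
	static int enter_low_power_with_wakeup(int device_fd)
	{
		char buf[sizeof(struct vfio_device_feature) +
			 sizeof(struct vfio_device_low_power_entry_with_wakeup)] = {};
		struct vfio_device_feature *feature = (void *)buf;
		struct vfio_device_low_power_entry_with_wakeup *entry =
			(void *)feature->data;
		int efd = eventfd(0, EFD_CLOEXEC);

		if (efd < 0)
			return -1;

		feature->argsz = sizeof(buf);
		feature->flags = VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP;
		entry->wakeup_eventfd = efd;

		if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature))
			return -1;
		return efd;	/* the kernel signals this fd on host-side wakeup */
	}

Leaving low power is the same ioctl with VFIO_DEVICE_FEATURE_SET |
VFIO_DEVICE_FEATURE_LOW_POWER_EXIT and no payload, which lands in
vfio_pci_core_pm_exit() above.
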
vfio/pci: Mask INTx during runtime suspend
This patch adds INTx handling during runtime suspend/resume. All the
suspend/resume related code for the user to put the device into the low
power state will be added in subsequent patches.
INTx lines may be shared among devices. Whenever any INTx interrupt arrives
for a VFIO device, vfio_intx_handler() is called for each device sharing the
interrupt. Inside vfio_intx_handler(), pci_check_and_mask_intx() checks
whether the interrupt was generated by the current device. If the device is
already in the D3cold state, its config space cannot be read, and attempting
to read it in D3cold can make a few systems unresponsive. To prevent this,
mask INTx in the runtime suspend callback and unmask it in the runtime
resume callback. If INTx has already been masked by the user, no handling is
needed in the runtime suspend/resume callbacks: 'pm_intx_masked' tracks
this, and vfio_pci_intx_mask() has been updated to return true if it changed
the INTx vfio_pci_irq_ctx.masked value.
For a runtime suspend triggered when there is no user of the VFIO device,
'irq_type' is VFIO_PCI_NUM_IRQS and these callbacks do nothing.
MSI/MSI-X are not shared, so similar handling is not needed there:
vfio_msihandler() triggers eventfd_signal() without any device-specific
config access, and when the user performs config access or an IOCTL after
receiving the eventfd notification, the device is moved to the D0 state
first before servicing any request.
Another option was to check 'pm_intx_masked' inside vfio_intx_handler()
instead of masking the interrupt. But that flag is set inside the
runtime_suspend callback while the device can still be in a non-D3cold state
(for example, if the user has disabled D3cold explicitly via sysfs, or
D3cold is not supported on the platform). Even where D3cold is supported,
the device remains in D0 until the PCI core moves it into D3cold, and it can
generate an interrupt in that window. A check in the IRQ handler would not
clear the IRQ status, the interrupt line would remain asserted, and that can
cause interrupt flooding.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-4-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:48 +00:00
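
As a sketch of the shared-line problem described above (an illustrative
handler, not this file's code; the real one lives in vfio_pci_intrs.c): each
handler on a shared INTx line must do a config-space read to learn whether
its own device asserted the interrupt, which is exactly the access that is
unsafe once the device is in D3cold.

	/*
	 * Illustrative shared-INTx handler. pci_check_and_mask_intx() reads
	 * PCI_STATUS and updates PCI_COMMAND, i.e. config space, so it must
	 * never run against a device that is already in D3cold.
	 */
	static irqreturn_t example_intx_handler(int irq, void *dev_id)
	{
		struct pci_dev *pdev = dev_id;

		if (!pci_check_and_mask_intx(pdev))
			return IRQ_NONE;	/* another device on the shared line */

		/* ...signal userspace; INTx stays masked until unmasked... */
		return IRQ_HANDLED;
	}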

#ifdef CONFIG_PM
static int vfio_pci_core_runtime_suspend(struct device *dev)
{
	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);

	down_write(&vdev->memory_lock);
	/*
	 * The user can move the device into D3hot state before invoking
	 * power management IOCTL. Move the device into D0 state here and then
	 * the pci-driver core runtime PM suspend function will move the device
	 * into the low power state. Also, for the devices which have
	 * NoSoftRst-, it will help in restoring the original state
	 * (saved locally in 'vdev->pm_save').
	 */
	vfio_pci_set_power_state(vdev, PCI_D0);
	up_write(&vdev->memory_lock);

	/*
	 * If INTx is enabled, then mask INTx before going into the runtime
	 * suspended state and unmask the same in the runtime resume.
	 * If INTx has already been masked by the user, then
	 * vfio_pci_intx_mask() will return false and in that case, INTx
	 * should not be unmasked in the runtime resume.
	 */
	vdev->pm_intx_masked = ((vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) &&
				vfio_pci_intx_mask(vdev));

	return 0;
}

static int vfio_pci_core_runtime_resume(struct device *dev)
{
	struct vfio_pci_core_device *vdev = dev_get_drvdata(dev);

	/*
	 * Resuming with a valid pm_wake_eventfd_ctx signals the eventfd and
	 * exits low power mode.
	 */
	down_write(&vdev->memory_lock);
	if (vdev->pm_wake_eventfd_ctx) {
		eventfd_signal(vdev->pm_wake_eventfd_ctx);
		__vfio_pci_runtime_pm_exit(vdev);
	}
	up_write(&vdev->memory_lock);

	if (vdev->pm_intx_masked)
		vfio_pci_intx_unmask(vdev);

	return 0;
}
#endif /* CONFIG_PM */
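
The userspace side of that resume path is a plain eventfd wait; a minimal
sketch, where 'wake_efd' is the fd armed at low power entry in the earlier
example:

	#include <poll.h>
	#include <stdint.h>
	#include <unistd.h>

	/*
	 * Block until vfio_pci_core_runtime_resume() signals the wakeup
	 * eventfd, then consume the count.
	 */
	static void wait_for_device_wakeup(int wake_efd)
	{
		struct pollfd pfd = { .fd = wake_efd, .events = POLLIN };
		uint64_t count;

		if (poll(&pfd, 1, -1) == 1 && (pfd.revents & POLLIN))
			read(wake_efd, &count, sizeof(count));
	}
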
vfio/pci: Move the unused device into low power state with runtime PM
Currently, there is very limited power management support available in the
upstream vfio_pci_core based drivers. If there are no users of the device,
the PCI device is moved into the D3hot state by writing directly into the
PCI PM registers. D3hot helps in saving power, but we can achieve zero power
consumption by going into D3cold. D3cold cannot be reached with native PCI
PM alone; it requires interaction with platform firmware, which is
system-specific. To go into low power states (including D3cold), the runtime
PM framework can be used, which internally interacts with PCI and platform
firmware and puts the device into the lowest possible D-state.
This patch registers vfio_pci_core based drivers with the runtime PM
framework.
1. The PCI core framework takes care of most of the runtime PM related
   things. To enable runtime PM, the PCI driver needs to decrement the
   usage count and provide a 'struct dev_pm_ops' at least. The runtime
   suspend/resume callbacks are optional and needed only for extra
   handling. Since there are now multiple vfio_pci_core based drivers,
   vfio_pci_core itself assigns the 'struct dev_pm_ops' instead of each
   individual parent driver doing so (other drivers also assign
   'struct dev_pm_ops' inside a core layer, for example wlcore_probe()
   and some sound based drivers).
2. This patch provides a stub implementation of 'struct dev_pm_ops'; a
   subsequent patch adds the runtime suspend/resume callbacks. All config
   state saving and PCI power management is done by the PCI core framework
   itself inside its runtime suspend/resume callbacks
   (pci_pm_runtime_suspend() and pci_pm_runtime_resume()).
3. Inside pci_reset_bus(), all the devices in the dev_set need to be
   runtime resumed. vfio_pci_dev_set_pm_runtime_get() takes care of the
   runtime resume and its error handling.
4. Inside vfio_pci_core_disable(), the device usage count always needs to
   be decremented, matching the increment in vfio_pci_core_enable().
5. Since the runtime PM framework provides the same functionality, direct
   writes into the PCI PM config register can be replaced with runtime PM
   routines, which also saves more power.
   On systems which do not support D3cold, with the existing
   implementation:
   // PCI device
   # cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
   D3hot
   // upstream bridge
   # cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
   D0
   With runtime PM:
   // PCI device
   # cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
   D3hot
   // upstream bridge
   # cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
   D3hot
   So, with runtime PM, the upstream bridge or root port also goes into a
   lower power state, which is not possible with the existing
   implementation.
   On systems which support D3cold, with the existing implementation:
   // PCI device
   # cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
   D3hot
   // upstream bridge
   # cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
   D0
   With runtime PM:
   // PCI device
   # cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
   D3cold
   // upstream bridge
   # cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
   D3cold
   So, with runtime PM, both the PCI device and the upstream bridge go
   into the D3cold state.
6. If the 'disable_idle_d3' module parameter is set, runtime PM is still
   enabled, but in this case the usage count should not be decremented.
7. The vfio_pci_dev_set_try_reset() return value is unused now, so its
   return type can be changed to void.
8. Use the runtime PM APIs in vfio_pci_core_sriov_configure(). The device
   can be in a low power state either through runtime power management
   (when there is no user) or through a PCI_PM_CTRL register write by the
   user. In both cases, the PF should be moved to the D0 state first. To
   prevent any runtime usage mismatch, pci_num_vf() is called explicitly
   during disable.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220518111612.16985-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-05-18 11:16:12 +00:00

/*
 * The pci-driver core runtime PM routines always save the device state
 * before going into the suspended state. If the device is going into a low
 * power state with only runtime PM ops, then no explicit handling is needed
 * for the devices which have NoSoftRst-.
 */
static const struct dev_pm_ops vfio_pci_core_pm_ops = {
	SET_RUNTIME_PM_OPS(vfio_pci_core_runtime_suspend,
			   vfio_pci_core_runtime_resume,
			   NULL)
};
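
Per point 1 of the commit message above, the core itself wires these ops up
at device registration time rather than requiring each parent driver to
declare them. A hedged sketch of that pattern follows; the helper name is
invented and the real hookup lives in the registration path elsewhere in
this file, which may differ in detail:

	/* Illustrative only: attach the core pm ops and allow idle suspend. */
	static void example_enable_runtime_pm(struct vfio_pci_core_device *vdev)
	{
		struct device *dev = &vdev->pdev->dev;

		dev->driver->pm = &vfio_pci_core_pm_ops;	/* core assigns the ops */
		pm_runtime_allow(dev);
		if (!disable_idle_d3)
			pm_runtime_put(dev);	/* drop usage count so the idle device can suspend */
	}
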
int vfio_pci_core_enable(struct vfio_pci_core_device *vdev)
{
	struct pci_dev *pdev = vdev->pdev;
	int ret;
	u16 cmd;
	u8 msix_pos;

	if (!disable_idle_d3) {
		ret = pm_runtime_resume_and_get(&pdev->dev);
		if (ret < 0)
			return ret;
	}

	/* Don't allow our initial saved state to include busmaster */
	pci_clear_master(pdev);

	ret = pci_enable_device(pdev);
	if (ret)
		goto out_power;

	/* If reset fails because of the device lock, fail this path entirely */
	ret = pci_try_reset_function(pdev);
	if (ret == -EAGAIN)
		goto out_disable_device;

	vdev->reset_works = !ret;
	pci_save_state(pdev);
	vdev->pci_saved_state = pci_store_saved_state(pdev);
	if (!vdev->pci_saved_state)
		pci_dbg(pdev, "%s: Couldn't store saved state\n", __func__);

vfio/pci: Hide broken INTx support from user
INTx masking has two components. The first is the ability to prevent the
device from continuing to assert INTx, provided via the DisINTx bit in the
command register; it is the only thing we can really probe for when testing
whether INTx masking is supported. The second is that the device must
indicate whether INTx is asserted via the interrupt status bit in the device
status register. With these two features we can generically determine
whether one of the devices we own is asserting INTx, signal the user, and
mask the interrupt while the user services the device.
Generally, if one or both of these components is broken, we resort to
APIC-level interrupt masking, which requires an exclusive interrupt since we
have no way to determine the source of the interrupt in a shared
configuration. This often makes it difficult or impossible to configure the
system for userspace use of the device, for an interrupt mode that the user
may not even need.
One possible configuration of broken INTx masking is that DisINTx support is
fully functional but the interrupt status bit never signals interrupt
assertion. In this case we can prevent the device from asserting INTx but
cannot identify the interrupt source. For this case we can simply pretend
that the device lacks INTx support entirely: keep DisINTx set on the
physical device, virtualize this bit for the user, and virtualize the
interrupt pin register to indicate no INTx support. We already support
virtualization of the DisINTx bit and already virtualize the interrupt pin
for platforms without INTx support. By tying these components together,
setting DisINTx on open and reset, and identifying devices broken in this
particular way, we can support them without the handicap of APIC-level INTx
masking.
Intel i40e (XL710/X710) 10/20/40GbE NICs have been identified as broken in
this specific way. We leave the vfio-pci.nointxmask option as a mechanism to
bypass this support, enabling INTx on the device with all the requirements
of APIC-level masking.
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Cc: John Ronciak <john.ronciak@intel.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
2016-03-24 19:05:18 +00:00
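
From the user's side, the virtualized interrupt pin is observable through
the vfio config region. A minimal sketch, where 'config_offset' is the start
of VFIO_PCI_CONFIG_REGION_INDEX as reported by VFIO_DEVICE_GET_REGION_INFO
(an assumption of this example, not shown in this file):

	#include <stdint.h>
	#include <unistd.h>
	#include <linux/pci_regs.h>

	/* Illustrative probe: a device hidden as described above reads back
	 * Interrupt Pin == 0 through the vfio config region, even though the
	 * physical device may have a real pin. */
	static int device_has_intx(int device_fd, off_t config_offset)
	{
		uint8_t pin = 0;

		if (pread(device_fd, &pin, sizeof(pin),
			  config_offset + PCI_INTERRUPT_PIN) != sizeof(pin))
			return 0;
		return pin != 0;	/* 0: fall back to MSI/MSI-X */
	}
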
	if (likely(!nointxmask)) {
		if (vfio_pci_nointx(pdev)) {
			pci_info(pdev, "Masking broken INTx support\n");
			vdev->nointx = true;
			pci_intx(pdev, 0);
		} else
			vdev->pci_2_3 = pci_intx_mask_supported(pdev);
	}

	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
		cmd &= ~PCI_COMMAND_INTX_DISABLE;
		pci_write_config_word(pdev, PCI_COMMAND, cmd);
	}

	ret = vfio_pci_zdev_open_device(vdev);
vfio/pci: Move the unused device into low power state with runtime PM
Currently, there is very limited power management support
available in the upstream vfio_pci_core based drivers. If there
are no users of the device, then the PCI device will be moved into
the D3hot state by writing directly into the PCI PM registers. The
D3hot state helps save power, but zero power consumption can only
be achieved by going into the D3cold state. D3cold cannot be
reached with native PCI PM alone; it requires interaction with
platform firmware, which is system-specific. To go into low power
states (including D3cold), the runtime PM framework can be used,
which internally interacts with PCI and platform firmware and puts
the device into the lowest possible D-state.
This patch registers vfio_pci_core based drivers with the
runtime PM framework.
1. The PCI core framework takes care of most of the runtime PM
related work. To enable runtime PM, the PCI driver needs to
decrement the usage count and provide at least a 'struct dev_pm_ops'.
The runtime suspend/resume callbacks are optional and needed
only if extra handling is required. Since there are now multiple
vfio_pci_core based drivers, vfio_pci_core itself assigns the
'struct dev_pm_ops' instead of each individual parent driver doing
so. Other drivers also assign 'struct dev_pm_ops' inside a core
layer (for example, wlcore_probe() and some sound drivers).
2. This patch provides a stub implementation of 'struct dev_pm_ops'.
A subsequent patch will provide the runtime suspend/resume
callbacks. All the config state saving and PCI power management
related work will be done by the PCI core framework itself inside
its runtime suspend/resume callbacks (pci_pm_runtime_suspend() and
pci_pm_runtime_resume()).
3. Inside pci_reset_bus(), all the devices in the dev_set need to
be runtime resumed. vfio_pci_dev_set_pm_runtime_get() takes care
of the runtime resume and its error handling.
4. Inside vfio_pci_core_disable(), the device usage count that was
incremented in vfio_pci_core_enable() always needs to be
decremented.
5. Since the runtime PM framework provides the same functionality,
directly writing into the PCI PM config register can be replaced
with the runtime PM routines, which also yields more power saving.
On systems which do not support D3cold,
with the existing implementation:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D0
With runtime PM:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D3hot
So, with runtime PM, the upstream bridge or root port also goes
into a lower power state, which is not possible with the existing
implementation.
On systems which support D3cold,
with the existing implementation:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D0
With runtime PM:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3cold
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D3cold
So, with runtime PM, both the PCI device and the upstream bridge go
into the D3cold state.
6. If the 'disable_idle_d3' module parameter is set, runtime PM is
still enabled, but in that case the usage count should not be
decremented.
7. The vfio_pci_dev_set_try_reset() return value is now unused, so
its return type can be changed to void.
8. Use the runtime PM APIs in vfio_pci_core_sriov_configure().
The device can be in a low power state either through runtime
power management (when there is no user) or through a PCI_PM_CTRL
register write by the user. In both cases, the PF should be moved
to the D0 state. To prevent any runtime usage count mismatch,
pci_num_vf() is called explicitly during disable.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220518111612.16985-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-05-18 11:16:12 +00:00
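A rough sketch of the pattern point 1 above describes (the names and
placement here are illustrative assumptions, not the exact hunks of
this patch): the driver registers a stub dev_pm_ops and drops its
usage count so an idle, unused device may runtime suspend.

static int example_probe(struct pci_dev *pdev,
                         const struct pci_device_id *id)
{
        /* ... normal device setup and registration ... */

        /*
         * Drop the usage count taken around probe by the PCI core so
         * the unused device may runtime suspend (D3hot, or D3cold via
         * platform firmware).
         */
        pm_runtime_put(&pdev->dev);
        return 0;
}

static const struct dev_pm_ops example_pm_ops = {
        /* Stub: pci_pm_runtime_suspend()/resume() do the real work. */
};

static struct pci_driver example_driver = {
        .name   = "example-vfio-pci",
        .probe  = example_probe,
        .driver = {
                .pm = &example_pm_ops,
        },
};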
|
|
|
if (ret)
|
|
|
|
goto out_free_state;
|
2016-03-24 19:05:18 +00:00
|
|
|
|
2022-06-06 20:33:21 +00:00
|
|
|
ret = vfio_config_init(vdev);
|
|
|
|
if (ret)
|
|
|
|
goto out_free_zdev;
|
|
|
|
|
2013-04-18 21:12:58 +00:00
|
|
|
msix_pos = pdev->msix_cap;
|
2012-07-31 14:16:24 +00:00
|
|
|
if (msix_pos) {
|
|
|
|
u16 flags;
|
|
|
|
u32 table;
|
|
|
|
|
|
|
|
pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
|
|
|
|
pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
|
|
|
|
|
2013-04-18 18:42:58 +00:00
|
|
|
vdev->msix_bar = table & PCI_MSIX_TABLE_BIR;
|
|
|
|
vdev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
|
2012-07-31 14:16:24 +00:00
|
|
|
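/* Each MSI-X table entry is 16 bytes; QSIZE encodes entries - 1. */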
vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
|
2023-05-11 15:44:36 +00:00
|
|
|
vdev->has_dyn_msix = pci_msix_can_alloc_dyn(pdev);
|
|
|
|
} else {
|
2012-07-31 14:16:24 +00:00
|
|
|
vdev->msix_bar = 0xFF;
|
2023-05-11 15:44:36 +00:00
|
|
|
vdev->has_dyn_msix = false;
|
|
|
|
}
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2015-04-07 17:14:41 +00:00
|
|
|
if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
|
2013-02-18 17:11:13 +00:00
|
|
|
vdev->has_vga = true;
|
|
|
|
|
2016-06-30 07:21:24 +00:00
|
|
|
|
2012-12-07 20:43:51 +00:00
|
|
|
return 0;
|
2022-05-18 11:16:12 +00:00
|
|
|
|
2022-06-06 20:33:21 +00:00
|
|
|
out_free_zdev:
|
|
|
|
vfio_pci_zdev_close_device(vdev);
|
2022-05-18 11:16:12 +00:00
|
|
|
out_free_state:
|
|
|
|
kfree(vdev->pci_saved_state);
|
|
|
|
vdev->pci_saved_state = NULL;
|
|
|
|
out_disable_device:
|
|
|
|
pci_disable_device(pdev);
|
|
|
|
out_power:
|
|
|
|
if (!disable_idle_d3)
|
|
|
|
pm_runtime_put(&pdev->dev);
|
|
|
|
return ret;
|
2012-07-31 14:16:24 +00:00
|
|
|
}
|
2021-08-26 10:39:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(vfio_pci_core_enable);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2021-08-26 10:39:06 +00:00
|
|
|
void vfio_pci_core_disable(struct vfio_pci_core_device *vdev)
|
2012-07-31 14:16:24 +00:00
|
|
|
{
|
2012-12-07 20:43:50 +00:00
|
|
|
struct pci_dev *pdev = vdev->pdev;
|
2016-06-30 07:21:24 +00:00
|
|
|
struct vfio_pci_dummy_resource *dummy_res, *tmp;
|
2018-03-21 18:46:21 +00:00
|
|
|
struct vfio_pci_ioeventfd *ioeventfd, *ioeventfd_tmp;
|
2016-02-22 23:02:39 +00:00
|
|
|
int i, bar;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
/* For needs_reset */
|
|
|
|
lockdep_assert_held(&vdev->vdev.dev_set->lock);
|
|
|
|
|
vfio/pci: wake-up devices around reset functions
If 'vfio_pci_core_device::needs_pm_restore' is set (the PCI device
does not have the No_Soft_Reset bit set in its PMCSR config register),
then the current PCI state will be saved locally in
'vfio_pci_core_device::pm_save' during the D0->D3hot transition and
restored during the D3hot->D0 transition. For reset-related
functionality, the vfio driver uses the PCI reset APIs. These
APIs internally change the PCI power state back to D0 first if
the device power state is non-D0. This state change to D0 happens
without the involvement of the vfio driver.
Consider the following example:
1. The device is in D3hot.
2. The user invokes the VFIO_DEVICE_RESET ioctl.
3. pci_try_reset_function() will be called, which internally
invokes pci_dev_save_and_disable().
4. pci_set_power_state(dev, PCI_D0) will be called first.
5. pci_save_state() will happen then.
Now, for devices which have NoSoftRst-, pci_set_power_state() can
trigger a soft reset, so the original PCI config state is lost at
step (4) and cannot be restored again. This original PCI state can
include any setting performed by the SBIOS or the host Linux kernel
(for example LTR, ASPM L1 substates, etc.). When this soft reset is
triggered, all these settings are reset, and the device state saved
at step (5) will also have them cleared, so they cannot be restored.
Since the vfio driver only exposes limited PCI capabilities to its
user, the vfio driver user won't have the option to save and restore
the state of these capabilities either, and these original settings
will be permanently lost.
The same situation can occur with pci_reset_bus(): the other
functions/devices can be in D3hot, and the reset will change the
power state of all devices to D0 without the involvement of the
vfio driver.
So, before calling any reset-related APIs, we need to make sure that
the device state is D0. This is mainly to preserve the state around
soft reset.
For vfio_pci_core_disable(), we use __pci_reset_function_locked(),
which internally can use pci_pm_reset() for the function reset.
pci_pm_reset() requires the device power state to be D0, otherwise
it returns an error.
This patch changes the device power state to D0 by invoking
vfio_pci_set_power_state() explicitly before calling any
reset-related APIs.
Fixes: 51ef3a004b1e ("vfio/pci: Restore device state on PM transition")
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220217122107.22434-3-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-02-17 12:21:07 +00:00
|
|
|
/*
|
|
|
|
* This function can be invoked while the power state is non-D0.
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
Currently, if runtime power management is enabled for vfio-pci
based devices in the guest OS, then the guest OS performs a register
write to the PCI_PM_CTRL register. This write request is handled in
vfio_pm_config_write(), where the actual PCI_PM_CTRL register write
takes place. With this, D3hot is the lowest power state that can be
reached. By using the runtime PM framework instead, we can achieve
the D3cold state (on supported systems), which saves the most power.
1. The D3cold state can't be reached by writing the standard PCI
PM config registers. This patch implements the following
newly added low-power-related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature allows the
device to make use of low power platform states on the host,
while VFIO_DEVICE_FEATURE_LOW_POWER_EXIT prevents
further use of those power states.
2. The vfio-pci driver uses the runtime PM framework for low power
entry and exit. On platforms where the D3cold state is supported,
the runtime PM framework will put the device into D3cold;
otherwise, D3hot or some other power state will be used.
There are various cases where the device will not go into the
runtime suspended state. For example:
- Runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still in the runtime
active state.
In these cases, the device will remain in whatever power state the
user configured through the PCI_PM_CTRL register.
3. Hypervisors can implement virtual ACPI methods. For example,
in a guest Linux OS, if the PCI device's ACPI node has _PR3 and
_PR0 power resources with _ON/_OFF methods, then the guest invokes
the _OFF method during the D3cold transition and _ON during the D0
transition. The hypervisor can trap these virtual ACPI calls and
then call the low power device feature ioctl.
4. The 'pm_runtime_engaged' flag tracks the entry to and exit from
runtime PM. This flag is protected by the 'memory_lock' semaphore.
5. All config and other region accesses are wrapped in
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, the device is resumed first before the access, and once the
access has finished, the device goes back into the runtime
suspended state.
6. Memory region access through mmap is not allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
a check for 'pm_runtime_engaged' has been added explicitly in
vfio_pci_mmap_fault() to block only mmap'ed access.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:49 +00:00
|
|
|
* This non-D0 power state can be with or without runtime PM.
|
|
|
|
* vfio_pci_runtime_pm_exit() will internally increment the usage
|
|
|
|
* count corresponding to pm_runtime_put() called during low power
|
|
|
|
* feature entry and then pm_runtime_resume() will wake up the device,
|
|
|
|
* if the device has already gone into the suspended state. Otherwise,
|
|
|
|
* the vfio_pci_set_power_state() will change the device power state
|
|
|
|
* to D0.
|
|
|
|
*/
|
|
|
|
vfio_pci_runtime_pm_exit(vdev);
|
|
|
|
pm_runtime_resume(&pdev->dev);
|
|
|
|
|
|
|
|
/*
|
2022-02-17 12:21:07 +00:00
|
|
|
* This function calls __pci_reset_function_locked() which internally
|
|
|
|
* can use pci_pm_reset() for the function reset. pci_pm_reset() will
|
|
|
|
* fail if the power state is non-D0. Also, for the devices which
|
|
|
|
* have NoSoftRst-, the reset function can cause the PCI config space
|
|
|
|
* reset without restoring the original state (saved locally in
|
|
|
|
* 'vdev->pm_save').
|
|
|
|
*/
|
|
|
|
vfio_pci_set_power_state(vdev, PCI_D0);
|
|
|
|
|
2014-08-07 17:12:02 +00:00
|
|
|
/* Stop the device from further DMA */
|
|
|
|
pci_clear_master(pdev);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
|
|
|
vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
|
|
|
|
VFIO_IRQ_SET_ACTION_TRIGGER,
|
|
|
|
vdev->irq_type, 0, 0, NULL);
|
|
|
|
|
2018-03-21 18:46:21 +00:00
|
|
|
/* Device closed, don't need mutex here */
|
|
|
|
list_for_each_entry_safe(ioeventfd, ioeventfd_tmp,
|
|
|
|
&vdev->ioeventfds_list, next) {
|
|
|
|
vfio_virqfd_disable(&ioeventfd->virqfd);
|
|
|
|
list_del(&ioeventfd->next);
|
|
|
|
kfree(ioeventfd);
|
|
|
|
}
|
|
|
|
vdev->ioeventfds_nr = 0;
|
|
|
|
|
2012-07-31 14:16:24 +00:00
|
|
|
vdev->virq_disabled = false;
|
|
|
|
|
2016-02-22 23:02:39 +00:00
|
|
|
for (i = 0; i < vdev->num_regions; i++)
|
|
|
|
vdev->region[i].ops->release(vdev, &vdev->region[i]);
|
|
|
|
|
|
|
|
vdev->num_regions = 0;
|
|
|
|
kfree(vdev->region);
|
|
|
|
vdev->region = NULL; /* don't krealloc a freed pointer */
|
|
|
|
|
2012-07-31 14:16:24 +00:00
|
|
|
vfio_config_free(vdev);
|
|
|
|
|
2019-09-27 23:43:08 +00:00
|
|
|
for (i = 0; i < PCI_STD_NUM_BARS; i++) {
|
|
|
|
bar = i + PCI_STD_RESOURCES;
|
2012-07-31 14:16:24 +00:00
|
|
|
if (!vdev->barmap[bar])
|
|
|
|
continue;
|
2012-12-07 20:43:50 +00:00
|
|
|
pci_iounmap(pdev, vdev->barmap[bar]);
|
|
|
|
pci_release_selected_regions(pdev, 1 << bar);
|
2012-07-31 14:16:24 +00:00
|
|
|
vdev->barmap[bar] = NULL;
|
|
|
|
}
|
2012-12-07 20:43:50 +00:00
|
|
|
|
2016-06-30 07:21:24 +00:00
|
|
|
list_for_each_entry_safe(dummy_res, tmp,
|
|
|
|
&vdev->dummy_resources_list, res_next) {
|
|
|
|
list_del(&dummy_res->res_next);
|
|
|
|
release_resource(&dummy_res->resource);
|
|
|
|
kfree(dummy_res);
|
|
|
|
}
|
|
|
|
|
2014-08-07 17:12:07 +00:00
|
|
|
vdev->needs_reset = true;
|
|
|
|
|
2022-06-06 20:33:21 +00:00
|
|
|
vfio_pci_zdev_close_device(vdev);
|
|
|
|
|
2012-12-07 20:43:50 +00:00
|
|
|
/*
|
|
|
|
* If we have saved state, restore it. If we can reset the device,
|
|
|
|
* even better. Resetting with current state seems better than
|
|
|
|
* nothing, but saving and restoring current state without reset
|
|
|
|
* is just busy work.
|
|
|
|
*/
|
|
|
|
if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
|
2019-03-30 14:41:35 +00:00
|
|
|
pci_info(pdev, "%s: Couldn't reload saved state\n", __func__);
|
2012-12-07 20:43:50 +00:00
|
|
|
|
|
|
|
if (!vdev->reset_works)
|
2014-08-07 17:12:02 +00:00
|
|
|
goto out;
|
2012-12-07 20:43:50 +00:00
|
|
|
|
|
|
|
pci_save_state(pdev);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Disable INTx and MSI, presumably to avoid spurious interrupts
|
|
|
|
* during reset. Stolen from pci_reset_function()
|
|
|
|
*/
|
|
|
|
pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
|
|
|
|
|
2013-06-10 22:40:57 +00:00
|
|
|
/*
|
2019-08-22 03:35:19 +00:00
|
|
|
* Try to get the locks ourselves to prevent a deadlock. The
|
|
|
|
* success of this is dependent on being able to lock the device,
|
|
|
|
* which is not always possible.
|
|
|
|
* We cannot use the "try" reset interface here, which will
|
|
|
|
* overwrite the previously restored configuration information.
|
2013-06-10 22:40:57 +00:00
|
|
|
*/
|
2021-06-23 02:28:24 +00:00
|
|
|
if (vdev->reset_works && pci_dev_trylock(pdev)) {
|
|
|
|
if (!__pci_reset_function_locked(pdev))
|
|
|
|
vdev->needs_reset = false;
|
|
|
|
pci_dev_unlock(pdev);
|
2019-08-22 03:35:19 +00:00
|
|
|
}
|
2012-12-07 20:43:50 +00:00
|
|
|
|
|
|
|
pci_restore_state(pdev);
|
2014-08-07 17:12:02 +00:00
|
|
|
out:
|
|
|
|
pci_disable_device(pdev);
|
2014-08-07 17:12:07 +00:00
|
|
|
|
2022-05-18 11:16:12 +00:00
|
|
|
vfio_pci_dev_set_try_reset(vdev->vdev.dev_set);
|
|
|
|
|
|
|
|
/* Put the pm-runtime usage counter acquired during enable */
|
|
|
|
if (!disable_idle_d3)
|
|
|
|
pm_runtime_put(&pdev->dev);
|
2012-07-31 14:16:24 +00:00
|
|
|
}
|
2021-08-26 10:39:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(vfio_pci_core_disable);
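From userspace, the low power entry described in the feature commit
message above is driven through the VFIO_DEVICE_FEATURE ioctl. A
hypothetical sketch (fd plumbing and error handling elided):

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper: opt the idle device into host low power states. */
static int example_low_power_entry(int device_fd)
{
        struct vfio_device_feature feature = {
                .argsz = sizeof(feature),
                .flags = VFIO_DEVICE_FEATURE_SET |
                         VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY,
        };

        return ioctl(device_fd, VFIO_DEVICE_FEATURE, &feature);
}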
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2021-08-26 10:39:05 +00:00
|
|
|
void vfio_pci_core_close_device(struct vfio_device *core_vdev)
|
2012-07-31 14:16:24 +00:00
|
|
|
{
|
2021-08-26 10:39:02 +00:00
|
|
|
struct vfio_pci_core_device *vdev =
|
|
|
|
container_of(core_vdev, struct vfio_pci_core_device, vdev);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-04-13 13:10:36 +00:00
|
|
|
if (vdev->sriov_pf_core_dev) {
|
|
|
|
mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
|
|
|
|
WARN_ON(!vdev->sriov_pf_core_dev->vf_token->users);
|
|
|
|
vdev->sriov_pf_core_dev->vf_token->users--;
|
|
|
|
mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
|
|
|
|
}
|
2022-12-05 15:29:19 +00:00
|
|
|
#if IS_ENABLED(CONFIG_EEH)
|
2022-12-05 15:29:16 +00:00
|
|
|
eeh_dev_release(vdev->pdev);
|
|
|
|
#endif
|
2021-08-26 10:39:06 +00:00
|
|
|
vfio_pci_core_disable(vdev);
|
2020-07-27 19:43:38 +00:00
|
|
|
|
2021-08-06 01:19:04 +00:00
|
|
|
mutex_lock(&vdev->igate);
|
|
|
|
if (vdev->err_trigger) {
|
|
|
|
eventfd_ctx_put(vdev->err_trigger);
|
|
|
|
vdev->err_trigger = NULL;
|
2014-06-10 01:41:57 +00:00
|
|
|
}
|
2021-08-06 01:19:04 +00:00
|
|
|
if (vdev->req_trigger) {
|
|
|
|
eventfd_ctx_put(vdev->req_trigger);
|
|
|
|
vdev->req_trigger = NULL;
|
|
|
|
}
|
|
|
|
mutex_unlock(&vdev->igate);
|
2012-07-31 14:16:24 +00:00
|
|
|
}
|
2021-08-26 10:39:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(vfio_pci_core_close_device);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2021-08-26 10:39:06 +00:00
|
|
|
void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev)
|
2012-07-31 14:16:24 +00:00
|
|
|
{
|
2021-08-26 10:39:06 +00:00
|
|
|
vfio_pci_probe_mmaps(vdev);
|
2022-12-05 15:29:19 +00:00
|
|
|
#if IS_ENABLED(CONFIG_EEH)
|
2022-12-05 15:29:16 +00:00
|
|
|
eeh_dev_open(vdev->pdev);
|
|
|
|
#endif
|
2022-04-13 13:10:36 +00:00
|
|
|
|
|
|
|
if (vdev->sriov_pf_core_dev) {
|
|
|
|
mutex_lock(&vdev->sriov_pf_core_dev->vf_token->lock);
|
|
|
|
vdev->sriov_pf_core_dev->vf_token->users++;
|
|
|
|
mutex_unlock(&vdev->sriov_pf_core_dev->vf_token->lock);
|
|
|
|
}
|
2012-07-31 14:16:24 +00:00
|
|
|
}
|
2021-08-26 10:39:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2021-08-26 10:39:02 +00:00
|
|
|
static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
|
2012-07-31 14:16:24 +00:00
|
|
|
{
|
|
|
|
if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
|
|
|
|
u8 pin;
|
2018-09-25 19:01:27 +00:00
|
|
|
|
|
|
|
if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
|
|
|
|
vdev->nointx || vdev->pdev->is_virtfn)
|
|
|
|
return 0;
|
|
|
|
|
2012-07-31 14:16:24 +00:00
|
|
|
pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
|
|
|
|
|
2018-09-25 19:01:27 +00:00
|
|
|
return pin ? 1 : 0;
|
2012-07-31 14:16:24 +00:00
|
|
|
} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
|
|
|
|
u8 pos;
|
|
|
|
u16 flags;
|
|
|
|
|
2013-04-18 21:12:58 +00:00
|
|
|
pos = vdev->pdev->msi_cap;
|
2012-07-31 14:16:24 +00:00
|
|
|
if (pos) {
|
|
|
|
pci_read_config_word(vdev->pdev,
|
|
|
|
pos + PCI_MSI_FLAGS, &flags);
|
2014-05-30 17:35:54 +00:00
|
|
|
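/* Multiple Message Capable (QMASK, bits 3:1) encodes log2(vectors). */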
return 1 << ((flags & PCI_MSI_FLAGS_QMASK) >> 1);
|
2012-07-31 14:16:24 +00:00
|
|
|
}
|
|
|
|
} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
|
|
|
|
u8 pos;
|
|
|
|
u16 flags;
|
|
|
|
|
2013-04-18 21:12:58 +00:00
|
|
|
pos = vdev->pdev->msix_cap;
|
2012-07-31 14:16:24 +00:00
|
|
|
if (pos) {
|
|
|
|
pci_read_config_word(vdev->pdev,
|
|
|
|
pos + PCI_MSIX_FLAGS, &flags);
|
|
|
|
|
|
|
|
return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
|
|
|
|
}
|
2015-02-06 22:05:08 +00:00
|
|
|
} else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) {
|
2013-03-11 15:31:22 +00:00
|
|
|
if (pci_is_pcie(vdev->pdev))
|
|
|
|
return 1;
|
2015-02-06 22:05:08 +00:00
|
|
|
} else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) {
|
|
|
|
return 1;
|
|
|
|
}
|
2012-07-31 14:16:24 +00:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-09-04 17:28:04 +00:00
|
|
|
static int vfio_pci_count_devs(struct pci_dev *pdev, void *data)
|
|
|
|
{
|
|
|
|
(*(int *)data)++;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct vfio_pci_fill_info {
|
2023-07-18 10:55:40 +00:00
|
|
|
struct vfio_device *vdev;
|
2024-05-03 14:31:36 +00:00
|
|
|
struct vfio_pci_dependent_device *devices;
|
|
|
|
int nr_devices;
|
2023-07-18 10:55:41 +00:00
|
|
|
u32 count;
|
2023-07-18 10:55:40 +00:00
|
|
|
u32 flags;
|
2013-09-04 17:28:04 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data)
|
|
|
|
{
|
2024-05-03 14:31:36 +00:00
|
|
|
struct vfio_pci_dependent_device *info;
|
2013-09-04 17:28:04 +00:00
|
|
|
struct vfio_pci_fill_info *fill = data;
|
|
|
|
|
2024-05-03 14:31:36 +00:00
|
|
|
/* The topology changed since we counted devices */
|
|
|
|
if (fill->count >= fill->nr_devices)
|
|
|
|
return -EAGAIN;
|
|
|
|
|
|
|
|
info = &fill->devices[fill->count++];
|
|
|
|
info->segment = pci_domain_nr(pdev->bus);
|
|
|
|
info->bus = pdev->bus->number;
|
|
|
|
info->devfn = pdev->devfn;
|
2013-09-04 17:28:04 +00:00
|
|
|
|
2023-07-18 10:55:40 +00:00
|
|
|
if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) {
|
|
|
|
struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev);
|
|
|
|
struct vfio_device_set *dev_set = fill->vdev->dev_set;
|
|
|
|
struct vfio_device *vdev;
|
2013-09-04 17:28:04 +00:00
|
|
|
|
2023-07-18 10:55:40 +00:00
|
|
|
/*
|
|
|
|
* hot-reset requires all affected devices be represented in
|
|
|
|
* the dev_set.
|
|
|
|
*/
|
|
|
|
vdev = vfio_find_device_in_devset(dev_set, &pdev->dev);
|
|
|
|
if (!vdev) {
|
2024-05-03 14:31:36 +00:00
|
|
|
info->devid = VFIO_PCI_DEVID_NOT_OWNED;
|
2023-07-18 10:55:40 +00:00
|
|
|
} else {
|
|
|
|
int id = vfio_iommufd_get_dev_id(vdev, iommufd);
|
|
|
|
|
|
|
|
if (id > 0)
|
2024-05-03 14:31:36 +00:00
|
|
|
info->devid = id;
|
2023-07-18 10:55:40 +00:00
|
|
|
else if (id == -ENOENT)
|
2024-05-03 14:31:36 +00:00
|
|
|
info->devid = VFIO_PCI_DEVID_OWNED;
|
2023-07-18 10:55:40 +00:00
|
|
|
else
|
2024-05-03 14:31:36 +00:00
|
|
|
info->devid = VFIO_PCI_DEVID_NOT_OWNED;
|
2023-07-18 10:55:40 +00:00
|
|
|
}
|
|
|
|
/* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */
|
2024-05-03 14:31:36 +00:00
|
|
|
if (info->devid == VFIO_PCI_DEVID_NOT_OWNED)
|
2023-07-18 10:55:40 +00:00
|
|
|
fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;
|
|
|
|
} else {
|
|
|
|
struct iommu_group *iommu_group;
|
|
|
|
|
|
|
|
iommu_group = iommu_group_get(&pdev->dev);
|
|
|
|
if (!iommu_group)
|
|
|
|
return -EPERM; /* Cannot reset non-isolated devices */
|
|
|
|
|
2024-05-03 14:31:36 +00:00
|
|
|
info->group_id = iommu_group_id(iommu_group);
|
2023-07-18 10:55:40 +00:00
|
|
|
iommu_group_put(iommu_group);
|
|
|
|
}
|
2023-07-18 10:55:41 +00:00
|
|
|
|
2013-09-04 17:28:04 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct vfio_pci_group_info {
|
|
|
|
int count;
|
2022-05-04 19:14:46 +00:00
|
|
|
struct file **files;
|
2013-09-04 17:28:04 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
static bool vfio_pci_dev_below_slot(struct pci_dev *pdev, struct pci_slot *slot)
|
|
|
|
{
|
|
|
|
for (; pdev; pdev = pdev->bus->self)
|
|
|
|
if (pdev->bus == slot->bus)
|
|
|
|
return (pdev->slot == slot);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct vfio_pci_walk_info {
|
2021-09-02 21:26:31 +00:00
|
|
|
int (*fn)(struct pci_dev *pdev, void *data);
|
2013-09-04 17:28:04 +00:00
|
|
|
void *data;
|
|
|
|
struct pci_dev *pdev;
|
|
|
|
bool slot;
|
|
|
|
int ret;
|
|
|
|
};
|
|
|
|
|
|
|
|
static int vfio_pci_walk_wrapper(struct pci_dev *pdev, void *data)
|
|
|
|
{
|
|
|
|
struct vfio_pci_walk_info *walk = data;
|
|
|
|
|
|
|
|
if (!walk->slot || vfio_pci_dev_below_slot(pdev, walk->pdev->slot))
|
|
|
|
walk->ret = walk->fn(pdev, walk->data);
|
|
|
|
|
|
|
|
return walk->ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
|
|
|
|
int (*fn)(struct pci_dev *,
|
|
|
|
void *data), void *data,
|
|
|
|
bool slot)
|
|
|
|
{
|
|
|
|
struct vfio_pci_walk_info walk = {
|
|
|
|
.fn = fn, .data = data, .pdev = pdev, .slot = slot, .ret = 0,
|
|
|
|
};
|
|
|
|
|
|
|
|
pci_walk_bus(pdev->bus, vfio_pci_walk_wrapper, &walk);
|
|
|
|
|
|
|
|
return walk.ret;
|
|
|
|
}
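A sketch of how this walker pairs with the small callbacks above,
e.g. vfio_pci_count_devs(); passing slot=false walks the whole bus,
while true restricts the walk to functions below the same slot:

        int count = 0;

        /* Count every function a bus reset on vdev->pdev would affect. */
        vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
                                      &count, false);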
|
|
|
|
|
2021-08-26 10:39:02 +00:00
|
|
|
static int msix_mmappable_cap(struct vfio_pci_core_device *vdev,
|
2017-12-13 02:31:31 +00:00
|
|
|
struct vfio_info_cap *caps)
|
2016-02-22 23:02:36 +00:00
|
|
|
{
|
2017-12-13 02:31:31 +00:00
|
|
|
struct vfio_info_cap_header header = {
|
|
|
|
.id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE,
|
|
|
|
.version = 1
|
|
|
|
};
|
2016-02-22 23:02:39 +00:00
|
|
|
|
2017-12-13 02:31:31 +00:00
|
|
|
return vfio_info_add_capability(caps, &header, sizeof(header));
|
2016-02-22 23:02:39 +00:00
|
|
|
}
|
|
|
|
|
2022-08-26 19:34:02 +00:00
|
|
|
int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
|
|
|
|
unsigned int type, unsigned int subtype,
|
|
|
|
const struct vfio_pci_regops *ops,
|
|
|
|
size_t size, u32 flags, void *data)
|
2016-02-22 23:02:39 +00:00
|
|
|
{
|
|
|
|
struct vfio_pci_region *region;
|
|
|
|
|
|
|
|
region = krealloc(vdev->region,
|
|
|
|
(vdev->num_regions + 1) * sizeof(*region),
|
2023-01-08 15:44:24 +00:00
|
|
|
GFP_KERNEL_ACCOUNT);
|
2016-02-22 23:02:39 +00:00
|
|
|
if (!region)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
vdev->region = region;
|
|
|
|
vdev->region[vdev->num_regions].type = type;
|
|
|
|
vdev->region[vdev->num_regions].subtype = subtype;
|
|
|
|
vdev->region[vdev->num_regions].ops = ops;
|
|
|
|
vdev->region[vdev->num_regions].size = size;
|
|
|
|
vdev->region[vdev->num_regions].flags = flags;
|
|
|
|
vdev->region[vdev->num_regions].data = data;
|
|
|
|
|
|
|
|
vdev->num_regions++;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
2022-08-26 19:34:02 +00:00
|
|
|
EXPORT_SYMBOL_GPL(vfio_pci_core_register_dev_region);
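A hypothetical caller (the vendor type, subtype, and names below are
invented for illustration) registers a device-specific region as
follows; a release callback is effectively mandatory, since
vfio_pci_core_disable() above invokes it for every region:

static ssize_t example_region_rw(struct vfio_pci_core_device *vdev,
                                 char __user *buf, size_t count,
                                 loff_t *ppos, bool iswrite)
{
        return -EINVAL;         /* stub */
}

static void example_region_release(struct vfio_pci_core_device *vdev,
                                   struct vfio_pci_region *region)
{
        /* Free region->data here if anything was allocated for it. */
}

static const struct vfio_pci_regops example_regops = {
        .rw             = example_region_rw,
        .release        = example_region_release,
};

static int example_register_region(struct vfio_pci_core_device *vdev)
{
        return vfio_pci_core_register_dev_region(vdev,
                        VFIO_REGION_TYPE_PCI_VENDOR_TYPE | 0x1234,
                        0x1, &example_regops, SZ_4K,
                        VFIO_REGION_INFO_FLAG_READ |
                        VFIO_REGION_INFO_FLAG_WRITE, NULL);
}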
|
2016-02-22 23:02:39 +00:00
|
|
|
|
2023-05-19 21:47:48 +00:00
|
|
|
static int vfio_pci_info_atomic_cap(struct vfio_pci_core_device *vdev,
|
|
|
|
struct vfio_info_cap *caps)
|
|
|
|
{
|
|
|
|
struct vfio_device_info_cap_pci_atomic_comp cap = {
|
|
|
|
.header.id = VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP,
|
|
|
|
.header.version = 1
|
|
|
|
};
|
|
|
|
struct pci_dev *pdev = pci_physfn(vdev->pdev);
|
|
|
|
u32 devcap2;
|
|
|
|
|
|
|
|
pcie_capability_read_dword(pdev, PCI_EXP_DEVCAP2, &devcap2);
|
|
|
|
|
|
|
|
if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP32) &&
|
|
|
|
!pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32))
|
|
|
|
cap.flags |= VFIO_PCI_ATOMIC_COMP32;
|
|
|
|
|
|
|
|
if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP64) &&
|
|
|
|
!pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64))
|
|
|
|
cap.flags |= VFIO_PCI_ATOMIC_COMP64;
|
|
|
|
|
|
|
|
if ((devcap2 & PCI_EXP_DEVCAP2_ATOMIC_COMP128) &&
|
|
|
|
!pci_enable_atomic_ops_to_root(pdev,
|
|
|
|
PCI_EXP_DEVCAP2_ATOMIC_COMP128))
|
|
|
|
cap.flags |= VFIO_PCI_ATOMIC_COMP128;
|
|
|
|
|
|
|
|
if (!cap.flags)
|
|
|
|
return -ENODEV;
|
|
|
|
|
|
|
|
return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
|
|
|
|
}
|
|
|
|
|
2022-08-31 20:15:57 +00:00
|
|
|
static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev,
|
2022-08-31 20:15:59 +00:00
|
|
|
struct vfio_device_info __user *arg)
|
2012-07-31 14:16:24 +00:00
|
|
|
{
|
2022-08-31 20:15:57 +00:00
|
|
|
unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs);
|
2023-08-09 20:31:44 +00:00
|
|
|
struct vfio_device_info info = {};
|
2022-08-31 20:15:58 +00:00
|
|
|
struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
|
|
|
|
int ret;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:59 +00:00
|
|
|
if (copy_from_user(&info, arg, minsz))
|
2022-08-31 20:15:58 +00:00
|
|
|
return -EFAULT;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (info.argsz < minsz)
|
|
|
|
return -EINVAL;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2023-08-09 20:31:44 +00:00
|
|
|
minsz = min_t(size_t, info.argsz, sizeof(info));
|
2020-10-07 18:56:23 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
info.flags = VFIO_DEVICE_FLAGS_PCI;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (vdev->reset_works)
|
|
|
|
info.flags |= VFIO_DEVICE_FLAGS_RESET;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
info.num_regions = VFIO_PCI_NUM_REGIONS + vdev->num_regions;
|
|
|
|
info.num_irqs = VFIO_PCI_NUM_IRQS;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
ret = vfio_pci_info_zdev_add_caps(vdev, &caps);
|
|
|
|
if (ret && ret != -ENODEV) {
|
|
|
|
pci_warn(vdev->pdev,
|
|
|
|
"Failed to setup zPCI info capabilities\n");
|
|
|
|
return ret;
|
|
|
|
}
|
2020-10-07 18:56:23 +00:00
|
|
|
|
2023-05-19 21:47:48 +00:00
|
|
|
ret = vfio_pci_info_atomic_cap(vdev, &caps);
|
|
|
|
if (ret && ret != -ENODEV) {
|
|
|
|
pci_warn(vdev->pdev,
|
|
|
|
"Failed to setup AtomicOps info capability\n");
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (caps.size) {
|
|
|
|
info.flags |= VFIO_DEVICE_FLAGS_CAPS;
|
|
|
|
if (info.argsz < sizeof(info) + caps.size) {
|
|
|
|
info.argsz = sizeof(info) + caps.size;
|
|
|
|
} else {
|
|
|
|
vfio_info_cap_shift(&caps, sizeof(info));
|
2022-08-31 20:15:59 +00:00
|
|
|
if (copy_to_user(arg + 1, caps.buf, caps.size)) {
|
2022-08-31 20:15:58 +00:00
|
|
|
kfree(caps.buf);
|
|
|
|
return -EFAULT;
|
2020-10-07 18:56:23 +00:00
|
|
|
}
|
2022-08-31 20:15:59 +00:00
|
|
|
info.cap_offset = sizeof(*arg);
|
2020-10-07 18:56:23 +00:00
|
|
|
}
|
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
kfree(caps.buf);
|
|
|
|
}
|
|
|
|
|
2022-08-31 20:15:59 +00:00
|
|
|
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
|
2022-08-31 20:15:57 +00:00
|
|
|
}
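On the userspace side, the argsz/cap_offset handshake implemented
above typically takes two calls; a hypothetical sketch (includes,
allocation failure, and error checks elided):

        struct vfio_device_info info = { .argsz = sizeof(info) };

        /* First call: fixed-size fields; flags report whether caps exist. */
        ioctl(device_fd, VFIO_DEVICE_GET_INFO, &info);
        if ((info.flags & VFIO_DEVICE_FLAGS_CAPS) &&
            info.argsz > sizeof(info)) {
                /* Re-call with a buffer big enough for the cap chain. */
                struct vfio_device_info *full = malloc(info.argsz);

                full->argsz = info.argsz;
                ioctl(device_fd, VFIO_DEVICE_GET_INFO, full);
                /* Capabilities begin at (char *)full + full->cap_offset. */
        }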
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:57 +00:00
|
|
|
static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
|
2022-08-31 20:15:59 +00:00
|
|
|
struct vfio_region_info __user *arg)
|
2022-08-31 20:15:57 +00:00
|
|
|
{
|
|
|
|
unsigned long minsz = offsetofend(struct vfio_region_info, offset);
|
2022-08-31 20:15:58 +00:00
|
|
|
struct pci_dev *pdev = vdev->pdev;
|
|
|
|
struct vfio_region_info info;
|
|
|
|
struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
|
|
|
|
int i, ret;
|
|
|
|
|
2022-08-31 20:15:59 +00:00
|
|
|
if (copy_from_user(&info, arg, minsz))
|
2022-08-31 20:15:58 +00:00
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
if (info.argsz < minsz)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
switch (info.index) {
|
|
|
|
case VFIO_PCI_CONFIG_REGION_INDEX:
|
|
|
|
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
|
|
|
|
info.size = pdev->cfg_size;
|
|
|
|
info.flags = VFIO_REGION_INFO_FLAG_READ |
|
|
|
|
VFIO_REGION_INFO_FLAG_WRITE;
|
|
|
|
break;
|
|
|
|
case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
|
|
|
|
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
|
|
|
|
info.size = pci_resource_len(pdev, info.index);
|
|
|
|
if (!info.size) {
|
|
|
|
info.flags = 0;
|
2012-07-31 14:16:24 +00:00
|
|
|
break;
|
2022-08-31 20:15:58 +00:00
|
|
|
}
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
info.flags = VFIO_REGION_INFO_FLAG_READ |
|
|
|
|
VFIO_REGION_INFO_FLAG_WRITE;
|
|
|
|
if (vdev->bar_mmap_supported[info.index]) {
|
|
|
|
info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
|
|
|
|
if (info.index == vdev->msix_bar) {
|
|
|
|
ret = msix_mmappable_cap(vdev, &caps);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2016-02-22 23:02:36 +00:00
|
|
|
}
|
2022-08-31 20:15:58 +00:00
|
|
|
}
|
2016-02-22 23:02:36 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
break;
|
|
|
|
case VFIO_PCI_ROM_REGION_INDEX: {
|
|
|
|
void __iomem *io;
|
|
|
|
size_t size;
|
|
|
|
u16 cmd;
|
|
|
|
|
|
|
|
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
|
|
|
|
info.flags = 0;
|
|
|
|
|
|
|
|
/* Report the BAR size, not the ROM size */
|
|
|
|
info.size = pci_resource_len(pdev, info.index);
|
|
|
|
if (!info.size) {
|
|
|
|
/* Shadow ROMs appear as PCI option ROMs */
|
|
|
|
if (pdev->resource[PCI_ROM_RESOURCE].flags &
|
|
|
|
IORESOURCE_ROM_SHADOW)
|
|
|
|
info.size = 0x20000;
|
|
|
|
else
|
|
|
|
break;
|
|
|
|
}
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
/*
|
|
|
|
* Is it really there? Enable memory decode for implicit access
|
|
|
|
* in pci_map_rom().
|
|
|
|
*/
|
|
|
|
cmd = vfio_pci_memory_lock_and_enable(vdev);
|
|
|
|
io = pci_map_rom(pdev, &size);
|
|
|
|
if (io) {
|
|
|
|
info.flags = VFIO_REGION_INFO_FLAG_READ;
|
|
|
|
pci_unmap_rom(pdev, io);
|
|
|
|
} else {
|
|
|
|
info.size = 0;
|
|
|
|
}
|
|
|
|
vfio_pci_memory_unlock_and_restore(vdev, cmd);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case VFIO_PCI_VGA_REGION_INDEX:
|
|
|
|
if (!vdev->has_vga)
|
|
|
|
return -EINVAL;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
|
|
|
|
info.size = 0xc0000;
|
|
|
|
info.flags = VFIO_REGION_INFO_FLAG_READ |
|
|
|
|
VFIO_REGION_INFO_FLAG_WRITE;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
break;
|
|
|
|
default: {
|
|
|
|
struct vfio_region_info_cap_type cap_type = {
|
|
|
|
.header.id = VFIO_REGION_INFO_CAP_TYPE,
|
|
|
|
.header.version = 1
|
|
|
|
};
|
2013-02-18 17:11:13 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
|
|
|
|
return -EINVAL;
|
|
|
|
info.index = array_index_nospec(
|
|
|
|
info.index, VFIO_PCI_NUM_REGIONS + vdev->num_regions);
|
2016-11-16 20:46:26 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
i = info.index - VFIO_PCI_NUM_REGIONS;
|
2016-02-22 23:02:39 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
|
|
|
|
info.size = vdev->region[i].size;
|
|
|
|
info.flags = vdev->region[i].flags;
|
2016-02-22 23:02:39 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
cap_type.type = vdev->region[i].type;
|
|
|
|
cap_type.subtype = vdev->region[i].subtype;
|
2016-02-22 23:02:39 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
ret = vfio_info_add_capability(&caps, &cap_type.header,
|
|
|
|
sizeof(cap_type));
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2016-11-16 20:46:26 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (vdev->region[i].ops->add_capability) {
|
|
|
|
ret = vdev->region[i].ops->add_capability(
|
|
|
|
vdev, &vdev->region[i], &caps);
|
2016-02-22 23:02:39 +00:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
2012-07-31 14:16:24 +00:00
|
|
|
}
|
2022-08-31 20:15:58 +00:00
|
|
|
}
|
|
|
|
}
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (caps.size) {
|
|
|
|
info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
|
|
|
|
if (info.argsz < sizeof(info) + caps.size) {
|
|
|
|
info.argsz = sizeof(info) + caps.size;
|
|
|
|
info.cap_offset = 0;
|
|
|
|
} else {
|
|
|
|
vfio_info_cap_shift(&caps, sizeof(info));
|
2022-08-31 20:15:59 +00:00
|
|
|
if (copy_to_user(arg + 1, caps.buf, caps.size)) {
|
2022-08-31 20:15:58 +00:00
|
|
|
kfree(caps.buf);
|
|
|
|
return -EFAULT;
|
2016-02-22 23:02:36 +00:00
|
|
|
}
|
2022-08-31 20:15:59 +00:00
|
|
|
info.cap_offset = sizeof(*arg);
|
2012-07-31 14:16:24 +00:00
|
|
|
}
|
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
kfree(caps.buf);
|
|
|
|
}
|
|
|
|
|
2022-08-31 20:15:59 +00:00
|
|
|
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
|
2022-08-31 20:15:57 +00:00
|
|
|
}
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:57 +00:00
|
|
|
static int vfio_pci_ioctl_get_irq_info(struct vfio_pci_core_device *vdev,
|
2022-08-31 20:15:59 +00:00
|
|
|
struct vfio_irq_info __user *arg)
|
2022-08-31 20:15:57 +00:00
|
|
|
{
|
|
|
|
unsigned long minsz = offsetofend(struct vfio_irq_info, count);
|
2022-08-31 20:15:58 +00:00
|
|
|
struct vfio_irq_info info;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:59 +00:00
|
|
|
if (copy_from_user(&info, arg, minsz))
|
2022-08-31 20:15:58 +00:00
|
|
|
return -EFAULT;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
|
|
|
|
return -EINVAL;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
switch (info.index) {
|
|
|
|
case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
|
|
|
|
case VFIO_PCI_REQ_IRQ_INDEX:
|
|
|
|
break;
|
|
|
|
case VFIO_PCI_ERR_IRQ_INDEX:
|
|
|
|
if (pci_is_pcie(vdev->pdev))
|
2013-03-11 15:31:22 +00:00
|
|
|
break;
|
2022-08-31 20:15:58 +00:00
|
|
|
fallthrough;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
2013-03-11 15:31:22 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
info.flags = VFIO_IRQ_INFO_EVENTFD;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
info.count = vfio_pci_get_irq_count(vdev, info.index);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
|
|
|
|
info.flags |=
|
|
|
|
(VFIO_IRQ_INFO_MASKABLE | VFIO_IRQ_INFO_AUTOMASKED);
|
2023-05-11 15:44:38 +00:00
|
|
|
else if (info.index != VFIO_PCI_MSIX_IRQ_INDEX || !vdev->has_dyn_msix)
|
2022-08-31 20:15:58 +00:00
|
|
|
info.flags |= VFIO_IRQ_INFO_NORESIZE;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:59 +00:00
|
|
|
return copy_to_user(arg, &info, minsz) ? -EFAULT : 0;
|
2022-08-31 20:15:57 +00:00
|
|
|
}
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:57 +00:00
|
|
|
static int vfio_pci_ioctl_set_irqs(struct vfio_pci_core_device *vdev,
|
2022-08-31 20:15:59 +00:00
|
|
|
struct vfio_irq_set __user *arg)
|
2022-08-31 20:15:57 +00:00
|
|
|
{
|
|
|
|
unsigned long minsz = offsetofend(struct vfio_irq_set, count);
|
2022-08-31 20:15:58 +00:00
|
|
|
struct vfio_irq_set hdr;
|
|
|
|
u8 *data = NULL;
|
|
|
|
int max, ret = 0;
|
|
|
|
size_t data_size = 0;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:59 +00:00
|
|
|
if (copy_from_user(&hdr, arg, minsz))
|
2022-08-31 20:15:58 +00:00
|
|
|
return -EFAULT;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
max = vfio_pci_get_irq_count(vdev, hdr.index);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
ret = vfio_set_irqs_validate_and_prepare(&hdr, max, VFIO_PCI_NUM_IRQS,
|
|
|
|
&data_size);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (data_size) {
|
2022-08-31 20:15:59 +00:00
|
|
|
data = memdup_user(&arg->data, data_size);
|
2022-08-31 20:15:58 +00:00
|
|
|
if (IS_ERR(data))
|
|
|
|
return PTR_ERR(data);
|
|
|
|
}
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
mutex_lock(&vdev->igate);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index, hdr.start,
|
|
|
|
hdr.count, data);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
mutex_unlock(&vdev->igate);
|
|
|
|
kfree(data);
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
return ret;
|
2022-08-31 20:15:57 +00:00
|
|
|
}
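The variable-length layout this handler parses is easiest to see from
userspace; a hypothetical sketch that wires MSI-X vector 0 to an
eventfd (error handling elided, assumes <sys/eventfd.h>):

        struct {
                struct vfio_irq_set set;
                int32_t fd;
        } trigger = {
                .set = {
                        .argsz  = sizeof(trigger),
                        .flags  = VFIO_IRQ_SET_DATA_EVENTFD |
                                  VFIO_IRQ_SET_ACTION_TRIGGER,
                        .index  = VFIO_PCI_MSIX_IRQ_INDEX,
                        .start  = 0,
                        .count  = 1,
                },
        };

        trigger.fd = eventfd(0, EFD_CLOEXEC);
        ioctl(device_fd, VFIO_DEVICE_SET_IRQS, &trigger);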
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2022-08-31 20:15:57 +00:00
|
|
|
static int vfio_pci_ioctl_reset(struct vfio_pci_core_device *vdev,
|
|
|
|
void __user *arg)
|
|
|
|
{
|
2022-08-31 20:15:58 +00:00
|
|
|
int ret;
|
2020-04-22 19:48:11 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
if (!vdev->reset_works)
|
|
|
|
return -EINVAL;
|
vfio-pci: Invalidate mmaps and block MMIO access on disabled memory
Accessing the disabled memory space of a PCI device would typically
result in a master abort response on conventional PCI, or an
unsupported request on PCI express. The user would generally see
these as a -1 response for the read return data and the write would be
silently discarded, possibly with an uncorrected, non-fatal AER error
triggered on the host. Some systems however take it upon themselves
to bring down the entire system when they see something that might
indicate a loss of data, such as this discarded write to a disabled
memory space.
To avoid this, we want to try to block the user from accessing memory
spaces while they're disabled. We start with a semaphore around the
memory enable bit, where writers modify the memory enable state and
must be serialized, while readers make use of the memory region and
can access in parallel. Writers include both direct manipulation via
the command register, as well as any reset path where the internal
mechanics of the reset may both explicitly and implicitly disable
memory access, and manipulation of the MSI-X configuration, where the
MSI-X vector table resides in MMIO space of the device. Readers
include the read and write file ops to access the vfio device fd
offsets as well as memory mapped access. In the latter case, we make
use of our new vma list support to zap, or invalidate, those memory
mappings in order to force them to be faulted back in on access.
Our semaphore usage will stall user access to MMIO spaces across
internal operations like reset, but the user might experience new
behavior when trying to access the MMIO space while disabled via the
PCI command register. Access via read or write while disabled will
return -EIO and access via memory maps will result in a SIGBUS. This
is expected to be compatible with known use cases and potentially
provides better error handling capabilities than present in the
hardware, while avoiding the more readily accessible and severe
platform error responses that might otherwise occur.
Fixes: CVE-2020-12888
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2020-04-22 19:48:11 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
vfio_pci_zap_and_down_write_memory_lock(vdev);
|
vfio/pci: wake-up devices around reset functions
If 'vfio_pci_core_device::needs_pm_restore' is set (PCI device does
not have No_Soft_Reset bit set in its PMCSR config register), then the
current PCI state will be saved locally in
'vfio_pci_core_device::pm_save' during D0->D3hot transition and same
will be restored back during D3hot->D0 transition. For reset-related
functionalities, vfio driver uses PCI reset API's. These
API's internally change the PCI power state back to D0 first if
the device power state is non-D0. This state change to D0 will happen
without the involvement of vfio driver.
Let's consider the following example:
1. The device is in D3hot.
2. User invokes VFIO_DEVICE_RESET ioctl.
3. pci_try_reset_function() will be called which internally
invokes pci_dev_save_and_disable().
4. pci_set_power_state(dev, PCI_D0) will be called first.
5. pci_save_state() will happen then.
Now, for the devices which has NoSoftRst-, the pci_set_power_state()
can trigger soft reset and the original PCI config state will be lost
at step (4) and this state cannot be restored again. This original PCI
state can include any setting which is performed by SBIOS or host
linux kernel (for example LTR, ASPM L1 substates, etc.). When this
soft reset will be triggered, then all these settings will be reset,
and the device state saved at step (5) will also have this setting
cleared so it cannot be restored. Since the vfio driver only exposes
limited PCI capabilities to its user, so the vfio driver user also
won't have the option to save and restore these capabilities state
either and these original settings will be permanently lost.
For pci_reset_bus() also, we can have the above situation.
The other functions/devices can be in D3hot and the reset will change
the power state of all devices to D0 without the involvement of vfio
driver.
So, before calling any reset-related API's, we need to make sure that
the device state is D0. This is mainly to preserve the state around
soft reset.
For vfio_pci_core_disable(), we use __pci_reset_function_locked()
which internally can use pci_pm_reset() for the function reset.
pci_pm_reset() requires the device power state to be in D0, otherwise
it returns error.
This patch changes the device power state to D0 by invoking
vfio_pci_set_power_state() explicitly before calling any reset related
API's.
Fixes: 51ef3a004b1e ("vfio/pci: Restore device state on PM transition")
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220217122107.22434-3-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-02-17 12:21:07 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
/*
|
|
|
|
* This function can be invoked while the power state is non-D0. If
|
|
|
|
* pci_try_reset_function() has been called while the power state is
|
|
|
|
* non-D0, then pci_try_reset_function() will internally set the power
|
|
|
|
* state to D0 without vfio driver involvement. For the devices which
|
|
|
|
* have NoSoftRst-, the reset function can cause the PCI config space
|
|
|
|
* reset without restoring the original state (saved locally in
|
|
|
|
* 'vdev->pm_save').
|
|
|
|
*/
|
|
|
|
vfio_pci_set_power_state(vdev, PCI_D0);
|
vfio/pci: wake-up devices around reset functions
If 'vfio_pci_core_device::needs_pm_restore' is set (PCI device does
not have No_Soft_Reset bit set in its PMCSR config register), then the
current PCI state will be saved locally in
'vfio_pci_core_device::pm_save' during D0->D3hot transition and same
will be restored back during D3hot->D0 transition. For reset-related
functionalities, vfio driver uses PCI reset API's. These
API's internally change the PCI power state back to D0 first if
the device power state is non-D0. This state change to D0 will happen
without the involvement of vfio driver.
Let's consider the following example:
1. The device is in D3hot.
2. User invokes VFIO_DEVICE_RESET ioctl.
3. pci_try_reset_function() will be called which internally
invokes pci_dev_save_and_disable().
4. pci_set_power_state(dev, PCI_D0) will be called first.
5. pci_save_state() will happen then.
Now, for the devices which has NoSoftRst-, the pci_set_power_state()
can trigger soft reset and the original PCI config state will be lost
at step (4) and this state cannot be restored again. This original PCI
state can include any setting which is performed by SBIOS or host
linux kernel (for example LTR, ASPM L1 substates, etc.). When this
soft reset will be triggered, then all these settings will be reset,
and the device state saved at step (5) will also have this setting
cleared so it cannot be restored. Since the vfio driver only exposes
limited PCI capabilities to its user, so the vfio driver user also
won't have the option to save and restore these capabilities state
either and these original settings will be permanently lost.
For pci_reset_bus() also, we can have the above situation.
The other functions/devices can be in D3hot and the reset will change
the power state of all devices to D0 without the involvement of vfio
driver.
So, before calling any reset-related API's, we need to make sure that
the device state is D0. This is mainly to preserve the state around
soft reset.
For vfio_pci_core_disable(), we use __pci_reset_function_locked()
which internally can use pci_pm_reset() for the function reset.
pci_pm_reset() requires the device power state to be in D0, otherwise
it returns error.
This patch changes the device power state to D0 by invoking
vfio_pci_set_power_state() explicitly before calling any reset related
API's.
Fixes: 51ef3a004b1e ("vfio/pci: Restore device state on PM transition")
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220217122107.22434-3-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-02-17 12:21:07 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
ret = pci_try_reset_function(vdev->pdev);
|
|
|
|
up_write(&vdev->memory_lock);
|
vfio-pci: Invalidate mmaps and block MMIO access on disabled memory
Accessing the disabled memory space of a PCI device would typically
result in a master abort response on conventional PCI, or an
unsupported request on PCI express. The user would generally see
these as a -1 response for the read return data and the write would be
silently discarded, possibly with an uncorrected, non-fatal AER error
triggered on the host. Some systems however take it upon themselves
to bring down the entire system when they see something that might
indicate a loss of data, such as this discarded write to a disabled
memory space.
To avoid this, we want to try to block the user from accessing memory
spaces while they're disabled. We start with a semaphore around the
memory enable bit, where writers modify the memory enable state and
must be serialized, while readers make use of the memory region and
can access in parallel. Writers include both direct manipulation via
the command register, as well as any reset path where the internal
mechanics of the reset may both explicitly and implicitly disable
memory access, and manipulation of the MSI-X configuration, where the
MSI-X vector table resides in MMIO space of the device. Readers
include the read and write file ops to access the vfio device fd
offsets as well as memory mapped access. In the latter case, we make
use of our new vma list support to zap, or invalidate, those memory
mappings in order to force them to be faulted back in on access.
Our semaphore usage will stall user access to MMIO spaces across
internal operations like reset, but the user might experience new
behavior when trying to access the MMIO space while disabled via the
PCI command register. Access via read or write while disabled will
return -EIO and access via memory maps will result in a SIGBUS. This
is expected to be compatible with known use cases and potentially
provides better error handling capabilities than present in the
hardware, while avoiding the more readily accessible and severe
platform error responses that might otherwise occur.
Fixes: CVE-2020-12888
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2020-04-22 19:48:11 +00:00
|
|
|
|
2022-08-31 20:15:58 +00:00
|
|
|
return ret;
|
2022-08-31 20:15:57 +00:00
|
|
|
}
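/*
 * Illustrative userspace sketch (not part of this driver): the function
 * reset carries no payload, so for an assumed open 'device_fd' the call is
 * simply:
 *
 *	ioctl(device_fd, VFIO_DEVICE_RESET);
 */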
static int vfio_pci_ioctl_get_pci_hot_reset_info(
	struct vfio_pci_core_device *vdev,
	struct vfio_pci_hot_reset_info __user *arg)
{
	unsigned long minsz =
		offsetofend(struct vfio_pci_hot_reset_info, count);
	struct vfio_pci_dependent_device *devices = NULL;
	struct vfio_pci_hot_reset_info hdr;
	struct vfio_pci_fill_info fill = {};
	bool slot = false;
	int ret, count = 0;

	if (copy_from_user(&hdr, arg, minsz))
		return -EFAULT;

	if (hdr.argsz < minsz)
		return -EINVAL;

	hdr.flags = 0;

	/* Can we do a slot or bus reset or neither? */
	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return -ENODEV;

	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					    &count, slot);
	if (ret)
		return ret;

	if (WARN_ON(!count)) /* Should always be at least one */
		return -ERANGE;

	if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) {
		hdr.count = count;
		ret = -ENOSPC;
		goto header;
	}

	devices = kcalloc(count, sizeof(*devices), GFP_KERNEL);
	if (!devices)
		return -ENOMEM;

	fill.devices = devices;
	fill.nr_devices = count;
	fill.vdev = &vdev->vdev;

	if (vfio_device_cdev_opened(&vdev->vdev))
		fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID |
			      VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED;

	mutex_lock(&vdev->vdev.dev_set->lock);
	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs,
					    &fill, slot);
	mutex_unlock(&vdev->vdev.dev_set->lock);
	if (ret)
		goto out;

	if (copy_to_user(arg->devices, devices,
			 sizeof(*devices) * fill.count)) {
		ret = -EFAULT;
		goto out;
	}

	hdr.count = fill.count;
	hdr.flags = fill.flags;

header:
	if (copy_to_user(arg, &hdr, minsz))
		ret = -EFAULT;
out:
	kfree(devices);
	return ret;
}
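/*
 * Illustrative userspace sketch (not part of this driver): the usual
 * two-call pattern for an assumed open 'device_fd'.  A header-only call
 * fails with ENOSPC but fills in the dependent-device count, after which
 * the caller retries with a buffer sized for that many entries.
 *
 *	struct vfio_pci_hot_reset_info hdr = { .argsz = sizeof(hdr) };
 *
 *	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, &hdr) &&
 *	    errno == ENOSPC) {
 *		size_t sz = sizeof(hdr) + hdr.count *
 *			    sizeof(struct vfio_pci_dependent_device);
 *		struct vfio_pci_hot_reset_info *info = calloc(1, sz);
 *
 *		info->argsz = sz;
 *		ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
 *	}
 */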
static int
vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev,
				    int array_count, bool slot,
				    struct vfio_pci_hot_reset __user *arg)
{
	int32_t *group_fds;
	struct file **files;
	struct vfio_pci_group_info info;
	int file_idx, count = 0, ret = 0;

	/*
	 * We can't let userspace give us an arbitrarily large buffer to copy,
	 * so verify how many we think there could be.  Note groups can have
	 * multiple devices so one group per device is the max.
	 */
	ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
					    &count, slot);
	if (ret)
		return ret;

	if (array_count > count)
		return -EINVAL;

	group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL);
	files = kcalloc(array_count, sizeof(*files), GFP_KERNEL);
	if (!group_fds || !files) {
		kfree(group_fds);
		kfree(files);
		return -ENOMEM;
	}

	if (copy_from_user(group_fds, arg->group_fds,
			   array_count * sizeof(*group_fds))) {
		kfree(group_fds);
		kfree(files);
		return -EFAULT;
	}

	/*
	 * Get the group file for each fd to ensure the group is held across
	 * the reset
	 */
	for (file_idx = 0; file_idx < array_count; file_idx++) {
		struct file *file = fget(group_fds[file_idx]);

		if (!file) {
			ret = -EBADF;
			break;
		}

		/* Ensure the FD is a vfio group FD. */
		if (!vfio_file_is_group(file)) {
			fput(file);
			ret = -EINVAL;
			break;
		}

		files[file_idx] = file;
	}

	kfree(group_fds);

	/* release reference to groups on error */
	if (ret)
		goto hot_reset_release;

	info.count = array_count;
	info.files = files;

	ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL);

hot_reset_release:
	for (file_idx--; file_idx >= 0; file_idx--)
		fput(files[file_idx]);

	kfree(files);
	return ret;
}
static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev,
					struct vfio_pci_hot_reset __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count);
	struct vfio_pci_hot_reset hdr;
	bool slot = false;

	if (copy_from_user(&hdr, arg, minsz))
		return -EFAULT;

	if (hdr.argsz < minsz || hdr.flags)
		return -EINVAL;

	/* zero-length array is only for cdev opened devices */
	if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev))
		return -EINVAL;

	/* Can we do a slot or bus reset or neither? */
	if (!pci_probe_reset_slot(vdev->pdev->slot))
		slot = true;
	else if (pci_probe_reset_bus(vdev->pdev->bus))
		return -ENODEV;

	if (hdr.count)
		return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count,
							   slot, arg);

	return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL,
					  vfio_iommufd_device_ictx(&vdev->vdev));
}
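/*
 * Illustrative userspace sketch (not part of this driver): perform the hot
 * reset through the legacy group interface, assuming an open 'device_fd'
 * and a single owning 'group_fd'.  cdev-opened devices instead pass
 * count == 0 and prove ownership through their iommufd context.
 *
 *	struct {
 *		struct vfio_pci_hot_reset hr;
 *		int32_t group_fds[1];
 *	} reset = {
 *		.hr = {
 *			.argsz = sizeof(reset),
 *			.count = 1,
 *		},
 *		.group_fds = { group_fd },
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_PCI_HOT_RESET, &reset);
 */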
static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev,
				    struct vfio_device_ioeventfd __user *arg)
{
	unsigned long minsz = offsetofend(struct vfio_device_ioeventfd, fd);
	struct vfio_device_ioeventfd ioeventfd;
	int count;

	if (copy_from_user(&ioeventfd, arg, minsz))
		return -EFAULT;

	if (ioeventfd.argsz < minsz)
		return -EINVAL;

	if (ioeventfd.flags & ~VFIO_DEVICE_IOEVENTFD_SIZE_MASK)
		return -EINVAL;

	count = ioeventfd.flags & VFIO_DEVICE_IOEVENTFD_SIZE_MASK;

	if (hweight8(count) != 1 || ioeventfd.fd < -1)
		return -EINVAL;

	return vfio_pci_ioeventfd(vdev, ioeventfd.offset, ioeventfd.data, count,
				  ioeventfd.fd);
}
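/*
 * Illustrative userspace sketch (not part of this driver): register a
 * 4-byte ioeventfd so that whenever 'efd' is signaled (e.g. by a KVM
 * ioeventfd matching a guest doorbell write), the kernel writes 0x1 to an
 * assumed register at BAR0 offset 0x1000 without a trip through userspace.
 * 'device_fd' and 'efd' are assumed fds; pass fd = -1 to tear it down.
 *
 *	struct vfio_device_ioeventfd ioeventfd = {
 *		.argsz = sizeof(ioeventfd),
 *		.flags = VFIO_DEVICE_IOEVENTFD_32,
 *		.offset = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX)
 *			  + 0x1000,
 *		.data = 0x1,
 *		.fd = efd,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_IOEVENTFD, &ioeventfd);
 */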
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
			 unsigned long arg)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	void __user *uarg = (void __user *)arg;

	switch (cmd) {
	case VFIO_DEVICE_GET_INFO:
		return vfio_pci_ioctl_get_info(vdev, uarg);
	case VFIO_DEVICE_GET_IRQ_INFO:
		return vfio_pci_ioctl_get_irq_info(vdev, uarg);
	case VFIO_DEVICE_GET_PCI_HOT_RESET_INFO:
		return vfio_pci_ioctl_get_pci_hot_reset_info(vdev, uarg);
	case VFIO_DEVICE_GET_REGION_INFO:
		return vfio_pci_ioctl_get_region_info(vdev, uarg);
	case VFIO_DEVICE_IOEVENTFD:
		return vfio_pci_ioctl_ioeventfd(vdev, uarg);
	case VFIO_DEVICE_PCI_HOT_RESET:
		return vfio_pci_ioctl_pci_hot_reset(vdev, uarg);
	case VFIO_DEVICE_RESET:
		return vfio_pci_ioctl_reset(vdev, uarg);
	case VFIO_DEVICE_SET_IRQS:
		return vfio_pci_ioctl_set_irqs(vdev, uarg);
	default:
		return -ENOTTY;
	}
}
EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl);
static int vfio_pci_core_feature_token(struct vfio_device *device, u32 flags,
				       uuid_t __user *arg, size_t argsz)
{
	struct vfio_pci_core_device *vdev =
		container_of(device, struct vfio_pci_core_device, vdev);
	uuid_t uuid;
	int ret;

	if (!vdev->vf_token)
		return -ENOTTY;
	/*
	 * We do not support GET of the VF Token UUID as this could
	 * expose the token of the previous device user.
	 */
	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
				 sizeof(uuid));
	if (ret != 1)
		return ret;

	if (copy_from_user(&uuid, arg, sizeof(uuid)))
		return -EFAULT;

	mutex_lock(&vdev->vf_token->lock);
	uuid_copy(&vdev->vf_token->uuid, &uuid);
	mutex_unlock(&vdev->vf_token->lock);
	return 0;
}
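/*
 * Illustrative userspace sketch (not part of this driver): a PF owner sets
 * a new VF token.  'device_fd' is an assumed open PF device fd, and
 * uuid_generate() from libuuid is an assumed UUID source filling the
 * 16-byte payload that follows the feature header.
 *
 *	struct {
 *		struct vfio_device_feature feature;
 *		unsigned char uuid[16];
 *	} token = {
 *		.feature = {
 *			.argsz = sizeof(token),
 *			.flags = VFIO_DEVICE_FEATURE_SET |
 *				 VFIO_DEVICE_FEATURE_PCI_VF_TOKEN,
 *		},
 *	};
 *
 *	uuid_generate(token.uuid);
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &token);
 */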
int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
				void __user *arg, size_t argsz)
{
	switch (flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY:
		return vfio_pci_core_pm_entry(device, flags, arg, argsz);
	case VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP:
		return vfio_pci_core_pm_entry_with_wakeup(device, flags,
							  arg, argsz);
	case VFIO_DEVICE_FEATURE_LOW_POWER_EXIT:
		return vfio_pci_core_pm_exit(device, flags, arg, argsz);
	case VFIO_DEVICE_FEATURE_PCI_VF_TOKEN:
		return vfio_pci_core_feature_token(device, flags, arg, argsz);
	default:
		return -ENOTTY;
	}
}
EXPORT_SYMBOL_GPL(vfio_pci_core_ioctl_feature);
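/*
 * Illustrative userspace sketch (not part of this driver): request low
 * power entry for an idle device, assuming an open 'device_fd'.  No
 * payload follows the feature header for this particular feature.
 *
 *	struct vfio_device_feature feature = {
 *		.argsz = sizeof(feature),
 *		.flags = VFIO_DEVICE_FEATURE_SET |
 *			 VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY,
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, &feature);
 */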
static ssize_t vfio_pci_rw(struct vfio_pci_core_device *vdev, char __user *buf,
|
2013-02-14 21:02:12 +00:00
|
|
|
size_t count, loff_t *ppos, bool iswrite)
|
2012-07-31 14:16:24 +00:00
|
|
|
{
|
|
|
|
unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
Currently, if the runtime power management is enabled for vfio-pci
based devices in the guest OS, then the guest OS will do the register
write for PCI_PM_CTRL register. This write request will be handled in
vfio_pm_config_write() where it will do the actual register write of
PCI_PM_CTRL register. With this, the maximum D3hot state can be
achieved for low power. If we can use the runtime PM framework, then
we can achieve the D3cold state (on the supported systems) which will
help in saving maximum power.
1. D3cold state can't be achieved by writing PCI standard
PM config registers. This patch implements the following
newly added low power related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the
device to make use of low power platform states on the host
while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent
further use of those power states.
2. The vfio-pci driver uses runtime PM framework for low power entry and
exit. On the platforms where D3cold state is supported, the runtime
PM framework will put the device into D3cold otherwise, D3hot or some
other power state will be used.
There are various cases where the device will not go into the runtime
suspended state. For example,
- The runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still in runtime active state.
For these cases, the device will be in the same power state that has
been configured by the user through PCI_PM_CTRL register.
3. The hypervisors can implement virtual ACPI methods. For example,
in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power
resources with _ON/_OFF method, then guest linux OS invokes
the _OFF method during D3cold transition and then _ON during D0
transition. The hypervisor can tap these virtual ACPI calls and then
call the low power device feature IOCTL.
4. The 'pm_runtime_engaged' flag tracks the entry and exit to
runtime PM. This flag is protected with 'memory_lock' semaphore.
5. All the config and other region access are wrapped under
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, then the device will be resumed first before access. Once the
access has been finished, then the device will again go into the
runtime suspended state.
6. The memory region access through mmap will not be allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
so check for 'pm_runtime_engaged' has been added explicitly in
vfio_pci_mmap_fault() to block only mmap'ed access.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:49 +00:00
|
|
|
int ret;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2016-02-22 23:02:39 +00:00
|
|
|
if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
|
2012-07-31 14:16:24 +00:00
|
|
|
return -EINVAL;
|
|
|
|
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
Currently, if the runtime power management is enabled for vfio-pci
based devices in the guest OS, then the guest OS will do the register
write for PCI_PM_CTRL register. This write request will be handled in
vfio_pm_config_write() where it will do the actual register write of
PCI_PM_CTRL register. With this, the maximum D3hot state can be
achieved for low power. If we can use the runtime PM framework, then
we can achieve the D3cold state (on the supported systems) which will
help in saving maximum power.
1. D3cold state can't be achieved by writing PCI standard
PM config registers. This patch implements the following
newly added low power related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the
device to make use of low power platform states on the host
while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent
further use of those power states.
2. The vfio-pci driver uses runtime PM framework for low power entry and
exit. On the platforms where D3cold state is supported, the runtime
PM framework will put the device into D3cold otherwise, D3hot or some
other power state will be used.
There are various cases where the device will not go into the runtime
suspended state. For example,
- The runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still in runtime active state.
For these cases, the device will be in the same power state that has
been configured by the user through PCI_PM_CTRL register.
3. The hypervisors can implement virtual ACPI methods. For example,
in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power
resources with _ON/_OFF method, then guest linux OS invokes
the _OFF method during D3cold transition and then _ON during D0
transition. The hypervisor can tap these virtual ACPI calls and then
call the low power device feature IOCTL.
4. The 'pm_runtime_engaged' flag tracks the entry and exit to
runtime PM. This flag is protected with 'memory_lock' semaphore.
5. All the config and other region access are wrapped under
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, then the device will be resumed first before access. Once the
access has been finished, then the device will again go into the
runtime suspended state.
6. The memory region access through mmap will not be allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
so check for 'pm_runtime_engaged' has been added explicitly in
vfio_pci_mmap_fault() to block only mmap'ed access.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:49 +00:00
|
|
|
ret = pm_runtime_resume_and_get(&vdev->pdev->dev);
|
|
|
|
if (ret) {
|
|
|
|
pci_info_ratelimited(vdev->pdev, "runtime resume failed %d\n",
|
|
|
|
ret);
|
|
|
|
return -EIO;
|
|
|
|
}
|
|
|
|
|
2013-02-14 21:02:12 +00:00
|
|
|
switch (index) {
|
|
|
|
case VFIO_PCI_CONFIG_REGION_INDEX:
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
Currently, if the runtime power management is enabled for vfio-pci
based devices in the guest OS, then the guest OS will do the register
write for PCI_PM_CTRL register. This write request will be handled in
vfio_pm_config_write() where it will do the actual register write of
PCI_PM_CTRL register. With this, the maximum D3hot state can be
achieved for low power. If we can use the runtime PM framework, then
we can achieve the D3cold state (on the supported systems) which will
help in saving maximum power.
1. D3cold state can't be achieved by writing PCI standard
PM config registers. This patch implements the following
newly added low power related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the
device to make use of low power platform states on the host
while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent
further use of those power states.
2. The vfio-pci driver uses runtime PM framework for low power entry and
exit. On the platforms where D3cold state is supported, the runtime
PM framework will put the device into D3cold otherwise, D3hot or some
other power state will be used.
There are various cases where the device will not go into the runtime
suspended state. For example,
- The runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still in runtime active state.
For these cases, the device will be in the same power state that has
been configured by the user through PCI_PM_CTRL register.
3. The hypervisors can implement virtual ACPI methods. For example,
in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power
resources with _ON/_OFF method, then guest linux OS invokes
the _OFF method during D3cold transition and then _ON during D0
transition. The hypervisor can tap these virtual ACPI calls and then
call the low power device feature IOCTL.
4. The 'pm_runtime_engaged' flag tracks the entry and exit to
runtime PM. This flag is protected with 'memory_lock' semaphore.
5. All the config and other region access are wrapped under
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, then the device will be resumed first before access. Once the
access has been finished, then the device will again go into the
runtime suspended state.
6. The memory region access through mmap will not be allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
so check for 'pm_runtime_engaged' has been added explicitly in
vfio_pci_mmap_fault() to block only mmap'ed access.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:49 +00:00
|
|
|
ret = vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
|
|
|
|
break;
|
2013-02-14 21:02:12 +00:00
|
|
|
|
2013-02-14 21:02:12 +00:00
|
|
|
case VFIO_PCI_ROM_REGION_INDEX:
|
|
|
|
if (iswrite)
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
Currently, if the runtime power management is enabled for vfio-pci
based devices in the guest OS, then the guest OS will do the register
write for PCI_PM_CTRL register. This write request will be handled in
vfio_pm_config_write() where it will do the actual register write of
PCI_PM_CTRL register. With this, the maximum D3hot state can be
achieved for low power. If we can use the runtime PM framework, then
we can achieve the D3cold state (on the supported systems) which will
help in saving maximum power.
1. D3cold state can't be achieved by writing PCI standard
PM config registers. This patch implements the following
newly added low power related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the
device to make use of low power platform states on the host
while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent
further use of those power states.
2. The vfio-pci driver uses runtime PM framework for low power entry and
exit. On the platforms where D3cold state is supported, the runtime
PM framework will put the device into D3cold otherwise, D3hot or some
other power state will be used.
There are various cases where the device will not go into the runtime
suspended state. For example,
- The runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still in runtime active state.
For these cases, the device will be in the same power state that has
been configured by the user through PCI_PM_CTRL register.
3. The hypervisors can implement virtual ACPI methods. For example,
in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power
resources with _ON/_OFF method, then guest linux OS invokes
the _OFF method during D3cold transition and then _ON during D0
transition. The hypervisor can tap these virtual ACPI calls and then
call the low power device feature IOCTL.
4. The 'pm_runtime_engaged' flag tracks the entry and exit to
runtime PM. This flag is protected with 'memory_lock' semaphore.
5. All the config and other region access are wrapped under
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, then the device will be resumed first before access. Once the
access has been finished, then the device will again go into the
runtime suspended state.
6. The memory region access through mmap will not be allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
so check for 'pm_runtime_engaged' has been added explicitly in
vfio_pci_mmap_fault() to block only mmap'ed access.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:49 +00:00
|
|
|
ret = -EINVAL;
|
|
|
|
else
|
|
|
|
ret = vfio_pci_bar_rw(vdev, buf, count, ppos, false);
|
|
|
|
break;
|
2012-07-31 14:16:24 +00:00
|
|
|
|
2013-02-14 21:02:12 +00:00
|
|
|
case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
Currently, if the runtime power management is enabled for vfio-pci
based devices in the guest OS, then the guest OS will do the register
write for PCI_PM_CTRL register. This write request will be handled in
vfio_pm_config_write() where it will do the actual register write of
PCI_PM_CTRL register. With this, the maximum D3hot state can be
achieved for low power. If we can use the runtime PM framework, then
we can achieve the D3cold state (on the supported systems) which will
help in saving maximum power.
1. D3cold state can't be achieved by writing PCI standard
PM config registers. This patch implements the following
newly added low power related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the
device to make use of low power platform states on the host
while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent
further use of those power states.
2. The vfio-pci driver uses runtime PM framework for low power entry and
exit. On the platforms where D3cold state is supported, the runtime
PM framework will put the device into D3cold otherwise, D3hot or some
other power state will be used.
There are various cases where the device will not go into the runtime
suspended state. For example,
- The runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still in runtime active state.
For these cases, the device will be in the same power state that has
been configured by the user through PCI_PM_CTRL register.
3. The hypervisors can implement virtual ACPI methods. For example,
in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power
resources with _ON/_OFF method, then guest linux OS invokes
the _OFF method during D3cold transition and then _ON during D0
transition. The hypervisor can tap these virtual ACPI calls and then
call the low power device feature IOCTL.
4. The 'pm_runtime_engaged' flag tracks the entry and exit to
runtime PM. This flag is protected with 'memory_lock' semaphore.
5. All the config and other region access are wrapped under
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, then the device will be resumed first before access. Once the
access has been finished, then the device will again go into the
runtime suspended state.
6. The memory region access through mmap will not be allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
so check for 'pm_runtime_engaged' has been added explicitly in
vfio_pci_mmap_fault() to block only mmap'ed access.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:49 +00:00
|
|
|
ret = vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
|
|
|
|
break;
|
2013-02-18 17:11:13 +00:00
|
|
|
|
|
|
|
case VFIO_PCI_VGA_REGION_INDEX:
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
Currently, if the runtime power management is enabled for vfio-pci
based devices in the guest OS, then the guest OS will do the register
write for PCI_PM_CTRL register. This write request will be handled in
vfio_pm_config_write() where it will do the actual register write of
PCI_PM_CTRL register. With this, the maximum D3hot state can be
achieved for low power. If we can use the runtime PM framework, then
we can achieve the D3cold state (on the supported systems) which will
help in saving maximum power.
1. D3cold state can't be achieved by writing PCI standard
PM config registers. This patch implements the following
newly added low power related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the
device to make use of low power platform states on the host
while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent
further use of those power states.
2. The vfio-pci driver uses runtime PM framework for low power entry and
exit. On the platforms where D3cold state is supported, the runtime
PM framework will put the device into D3cold otherwise, D3hot or some
other power state will be used.
There are various cases where the device will not go into the runtime
suspended state. For example,
- The runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still in runtime active state.
For these cases, the device will be in the same power state that has
been configured by the user through PCI_PM_CTRL register.
3. The hypervisors can implement virtual ACPI methods. For example,
in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power
resources with _ON/_OFF method, then guest linux OS invokes
the _OFF method during D3cold transition and then _ON during D0
transition. The hypervisor can tap these virtual ACPI calls and then
call the low power device feature IOCTL.
4. The 'pm_runtime_engaged' flag tracks the entry and exit to
runtime PM. This flag is protected with 'memory_lock' semaphore.
5. All the config and other region access are wrapped under
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, then the device will be resumed first before access. Once the
access has been finished, then the device will again go into the
runtime suspended state.
6. The memory region access through mmap will not be allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
so check for 'pm_runtime_engaged' has been added explicitly in
vfio_pci_mmap_fault() to block only mmap'ed access.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:49 +00:00
|
|
|
ret = vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
|
|
|
|
break;
|
|
|
|
|
2016-02-22 23:02:39 +00:00
|
|
|
default:
|
|
|
|
index -= VFIO_PCI_NUM_REGIONS;
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
Currently, if the runtime power management is enabled for vfio-pci
based devices in the guest OS, then the guest OS will do the register
write for PCI_PM_CTRL register. This write request will be handled in
vfio_pm_config_write() where it will do the actual register write of
PCI_PM_CTRL register. With this, the maximum D3hot state can be
achieved for low power. If we can use the runtime PM framework, then
we can achieve the D3cold state (on the supported systems) which will
help in saving maximum power.
1. D3cold state can't be achieved by writing PCI standard
PM config registers. This patch implements the following
newly added low power related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature will allow the
device to make use of low power platform states on the host
while the VFIO_DEVICE_FEATURE_LOW_POWER_EXIT will prevent
further use of those power states.
2. The vfio-pci driver uses runtime PM framework for low power entry and
exit. On the platforms where D3cold state is supported, the runtime
PM framework will put the device into D3cold otherwise, D3hot or some
other power state will be used.
There are various cases where the device will not go into the runtime
suspended state. For example,
- The runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still in runtime active state.
For these cases, the device will be in the same power state that has
been configured by the user through PCI_PM_CTRL register.
3. The hypervisors can implement virtual ACPI methods. For example,
in guest linux OS if PCI device ACPI node has _PR3 and _PR0 power
resources with _ON/_OFF method, then guest linux OS invokes
the _OFF method during D3cold transition and then _ON during D0
transition. The hypervisor can tap these virtual ACPI calls and then
call the low power device feature IOCTL.
4. The 'pm_runtime_engaged' flag tracks the entry and exit to
runtime PM. This flag is protected with 'memory_lock' semaphore.
5. All the config and other region access are wrapped under
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, then the device will be resumed first before access. Once the
access has been finished, then the device will again go into the
runtime suspended state.
6. The memory region access through mmap will not be allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
so check for 'pm_runtime_engaged' has been added explicitly in
vfio_pci_mmap_fault() to block only mmap'ed access.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:49 +00:00
|
|
|
ret = vdev->region[index].ops->rw(vdev, buf,
|
2016-02-22 23:02:39 +00:00
|
|
|
count, ppos, iswrite);
|
		break;
	}

	pm_runtime_put(&vdev->pdev->dev);
	return ret;
}

ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
		size_t count, loff_t *ppos)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	if (!count)
		return 0;

	return vfio_pci_rw(vdev, buf, count, ppos, false);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_read);

ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
		size_t count, loff_t *ppos)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);

	if (!count)
		return 0;

	return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_write);
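
The read/write paths above dispatch on a region index encoded in the file
offset. As a hedged userspace sketch (device_fd is assumed open; the helper is
hypothetical), a user looks up a region's offset with
VFIO_DEVICE_GET_REGION_INFO and then pread()/pwrite() at that offset:

#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/pci_regs.h>
#include <linux/vfio.h>

/* Hypothetical example: read the vendor ID through the config region. */
static int read_vendor_id(int device_fd, __u16 *vendor)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_CONFIG_REGION_INDEX,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info))
		return -1;

	/* The kernel routes this pread() to vfio_pci_core_read() above. */
	if (pread(device_fd, vendor, sizeof(*vendor),
		  info.offset + PCI_VENDOR_ID) != sizeof(*vendor))
		return -1;

	return 0;
}
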
static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
{
	struct vfio_device *core_vdev = &vdev->vdev;
	loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
	loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
	loff_t len = end - start;

	unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true);
}

void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
{
	down_write(&vdev->memory_lock);
	vfio_pci_zap_bars(vdev);
}
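
A hedged sketch of the writer-side pattern this helper supports; the reset
call is illustrative and the function name is hypothetical, not taken from
this section:

/* Hedged sketch (not from this file): reset paths elsewhere in the driver
 * invalidate user mmaps and hold memory_lock for write across an operation
 * that can disable memory decode. */
static int example_reset(struct vfio_pci_core_device *vdev)
{
	int ret;

	vfio_pci_zap_and_down_write_memory_lock(vdev);
	ret = pci_try_reset_function(vdev->pdev);
	up_write(&vdev->memory_lock);
	return ret;
}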

u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
{
	u16 cmd;

	down_write(&vdev->memory_lock);
	pci_read_config_word(vdev->pdev, PCI_COMMAND, &cmd);
	if (!(cmd & PCI_COMMAND_MEMORY))
		pci_write_config_word(vdev->pdev, PCI_COMMAND,
				      cmd | PCI_COMMAND_MEMORY);

	return cmd;
}

void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd)
{
	pci_write_config_word(vdev->pdev, PCI_COMMAND, cmd);
	up_write(&vdev->memory_lock);
}
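
These two helpers are meant to be used as a bracketing pair. A hedged sketch
of the intended pattern (the function name and the MSI-X example are
illustrative, not from this file):

/* Hedged sketch: serialize against userspace MMIO, force memory decode on,
 * perform an access that requires the device to decode memory (e.g. the
 * MSI-X vector table), then restore the original command register. */
static void example_mmio_touch(struct vfio_pci_core_device *vdev)
{
	u16 cmd = vfio_pci_memory_lock_and_enable(vdev);

	/* ... access device MMIO safely here ... */

	vfio_pci_memory_unlock_and_restore(vdev, cmd);
}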

static unsigned long vma_to_pfn(struct vm_area_struct *vma)
{
	struct vfio_pci_core_device *vdev = vma->vm_private_data;
	int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
	u64 pgoff;

	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);

	return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
}

static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_core_device *vdev = vma->vm_private_data;
	unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
	unsigned long addr = vma->vm_start;
	vm_fault_t ret = VM_FAULT_SIGBUS;

	pfn = vma_to_pfn(vma);

	down_read(&vdev->memory_lock);

	if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
		goto out_unlock;

	ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
	if (ret & VM_FAULT_ERROR)
		goto out_unlock;

	/*
	 * Pre-fault the remainder of the vma, abort further insertions and
	 * suppress error if fault is encountered during pre-fault.
	 */
	for (; addr < vma->vm_end; addr += PAGE_SIZE, pfn++) {
		if (addr == vmf->address)
			continue;

		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
			break;
	}

out_unlock:
	up_read(&vdev->memory_lock);

	return ret;
}

static const struct vm_operations_struct vfio_pci_mmap_ops = {
	.fault = vfio_pci_mmap_fault,
};

int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct pci_dev *pdev = vdev->pdev;
	unsigned int index;
	u64 phys_len, req_len, pgoff, req_start;
	int ret;

	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);

	if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions)
		return -EINVAL;
	if (vma->vm_end < vma->vm_start)
		return -EINVAL;
	if ((vma->vm_flags & VM_SHARED) == 0)
		return -EINVAL;
	if (index >= VFIO_PCI_NUM_REGIONS) {
		int regnum = index - VFIO_PCI_NUM_REGIONS;
		struct vfio_pci_region *region = vdev->region + regnum;

		if (region->ops && region->ops->mmap &&
		    (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
			return region->ops->mmap(vdev, region, vma);
		return -EINVAL;
	}
	if (index >= VFIO_PCI_ROM_REGION_INDEX)
		return -EINVAL;
	if (!vdev->bar_mmap_supported[index])
		return -EINVAL;

	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
	req_len = vma->vm_end - vma->vm_start;
	pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	req_start = pgoff << PAGE_SHIFT;

	if (req_start + req_len > phys_len)
		return -EINVAL;

	/*
	 * Even though we don't make use of the barmap for the mmap,
	 * we need to request the region and the barmap tracks that.
	 */
	if (!vdev->barmap[index]) {
		ret = pci_request_selected_regions(pdev,
						   1 << index, "vfio-pci");
		if (ret)
			return ret;

		vdev->barmap[index] = pci_iomap(pdev, index, 0);
		if (!vdev->barmap[index]) {
			pci_release_selected_regions(pdev, 1 << index);
			return -ENOMEM;
		}
	}

	vma->vm_private_data = vdev;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);

	/*
	 * Set vm_flags now, they should not be changed in the fault handler.
	 * We want the same flags and page protection (decrypted above) as
	 * io_remap_pfn_range() would set.
	 *
	 * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
	 * allowing KVM stage 2 device mapping attributes to use Normal-NC
	 * rather than DEVICE_nGnRE, which allows guest mappings
	 * supporting write-combining attributes (WC). ARM does not
	 * architecturally guarantee this is safe, and indeed some MMIO
	 * regions like the GICv2 VCPU interface can trigger uncontained
	 * faults if Normal-NC is used.
	 *
	 * To safely use VFIO in KVM the platform must guarantee full
	 * safety in the guest where no action taken against a MMIO
	 * mapping can trigger an uncontained failure. The assumption is
	 * that most VFIO PCI platforms support this for both mapping types,
	 * at least in common flows, based on some expectations of how
	 * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in
	 * the VMA flags.
	 */
	vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
			VM_DONTEXPAND | VM_DONTDUMP);
	vma->vm_ops = &vfio_pci_mmap_ops;

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_mmap);
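
From userspace, the offset encoding enforced above is opaque: the user obtains
each region's offset and size from VFIO_DEVICE_GET_REGION_INFO and passes them
straight to mmap(). A hedged sketch (device_fd is assumed open; the helper is
hypothetical):

#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Hypothetical helper: map BAR0 of an open vfio device fd. */
static void *map_bar0(int device_fd, size_t *size)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = VFIO_PCI_BAR0_REGION_INDEX,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info) ||
	    !(info.flags & VFIO_REGION_INFO_FLAG_MMAP))
		return MAP_FAILED;

	*size = info.size;
	/* The offset encodes the region index; vfio_pci_core_mmap() decodes it. */
	return mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
		    device_fd, info.offset);
}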

void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	struct pci_dev *pdev = vdev->pdev;

	mutex_lock(&vdev->igate);

	if (vdev->req_trigger) {
		if (!(count % 10))
			pci_notice_ratelimited(pdev,
				"Relaying device request to user (#%u)\n",
				count);
		eventfd_signal(vdev->req_trigger);
	} else if (count == 0) {
		pci_warn(pdev,
			"No device request channel registered, blocked until released by user\n");
	}

	mutex_unlock(&vdev->igate);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_request);
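
The req_trigger eventfd signalled above is registered by userspace through the
request IRQ index. A hedged sketch (device_fd is assumed open; the helper name
is hypothetical):

#include <string.h>
#include <sys/ioctl.h>
#include <sys/eventfd.h>
#include <linux/vfio.h>

/* Hypothetical helper: returns the eventfd that vfio_pci_core_request()
 * will signal, or -1 on error. */
static int register_req_eventfd(int device_fd)
{
	char buf[sizeof(struct vfio_irq_set) + sizeof(int)];
	struct vfio_irq_set *irq_set = (struct vfio_irq_set *)buf;
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;

	irq_set->argsz = sizeof(buf);
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_REQ_IRQ_INDEX;
	irq_set->start = 0;
	irq_set->count = 1;
	memcpy(irq_set->data, &efd, sizeof(efd));

	return ioctl(device_fd, VFIO_DEVICE_SET_IRQS, irq_set) ? -1 : efd;
}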

vfio/pci: Introduce VF token
If we enable SR-IOV on a vfio-pci owned PF, the resulting VFs are not
fully isolated from the PF. The PF can always cause a denial of service
to the VF, even if just by resetting itself. The degree to which a PF
can access the data passed through a VF or interfere with its operation
depends on the given SR-IOV implementation. We therefore want to
avoid a scenario where an existing vfio-pci based userspace driver might
assume the PF driver is trusted, for example assigning a PF to one VM
and a VF to another with some expectation of isolation. IOMMU grouping
could be a solution to this, but it imposes an unnecessarily strong
relationship between PF and VF drivers if they need to operate with the
same IOMMU context. Instead we introduce a "VF token", which is
essentially just a shared secret between PF and VF drivers, implemented
as a UUID.
The VF token can be set by a vfio-pci based PF driver and must be known
by the vfio-pci based VF driver in order to gain access to the device.
This allows the degree to which this VF token is considered secret to be
determined by the applications and environment. For example, a VM might
generate a random UUID known only internally to the hypervisor, while a
userspace networking appliance might use a shared, or even well-known,
UUID among the application drivers.
To incorporate this VF token, the VFIO_GROUP_GET_DEVICE_FD interface is
extended to accept key=value pairs in addition to the device name. This
allows us to most easily deny user access to the device without the risk
that existing userspace drivers assume region offsets, IRQs, and other
device features, leading to more elaborate error paths. The format of
these options is expected to take the form:
"$DEVICE_NAME $OPTION1=$VALUE1 $OPTION2=$VALUE2"
where the device name is always provided first for compatibility and
additional options are specified in a space separated list. The
relationship between, and requirements for, the additional options are
vfio bus driver dependent; however, an unknown or unused option within
this scheme should return an error. This allows for future use of
unknown options as well as giving the user a positive indication that
an option is used.
An example VF token option would take this form:
"0000:03:00.0 vf_token=2ab74924-c335-45f4-9b16-8569e5b08258"
When accessing a VF where the PF is making use of vfio-pci, the user
MUST provide the current vf_token. When accessing a PF, the user MUST
provide the current vf_token IF there are active VF users, or MAY
provide a vf_token in order to set the current VF token when no VF
users are active. The former requirement assures VF users that an
unassociated driver cannot usurp the PF device. These semantics also
imply that a VF token MUST be set by a PF driver before VF drivers can
access their device; the default token is random, and mechanisms to
read the token are not provided, in order to protect the VF token of
previous users. Use of the vf_token option outside of these cases will
return an error, as discussed above.
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
				      bool vf_token, uuid_t *uuid)
{
	/*
	 * There's always some degree of trust or collaboration between SR-IOV
	 * PF and VFs, even if just that the PF hosts the SR-IOV capability and
	 * can disrupt VFs with a reset, but often the PF has more explicit
	 * access to deny service to the VF or access data passed through the
	 * VF.  We therefore require an opt-in via a shared VF token (UUID) to
	 * represent this trust.  This both prevents that a VF driver might
	 * assume the PF driver is a trusted, in-kernel driver, and also that
	 * a PF driver might be replaced with a rogue driver, unknown to in-use
	 * VF drivers.
	 *
	 * Therefore when presented with a VF, if the PF is a vfio device and
	 * it is bound to the vfio-pci driver, the user needs to provide a VF
	 * token to access the device, in the form of appending a vf_token to
	 * the device name, for example:
	 *
	 * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3"
	 *
	 * When presented with a PF which has VFs in use, the user must also
	 * provide the current VF token to prove collaboration with existing
	 * VF users.  If VFs are not in use, the VF token provided for the PF
	 * device will act to set the VF token.
	 *
	 * If the VF token is provided but unused, an error is generated.
	 */
	if (vdev->pdev->is_virtfn) {
		struct vfio_pci_core_device *pf_vdev = vdev->sriov_pf_core_dev;
		bool match;

		if (!pf_vdev) {
			if (!vf_token)
				return 0; /* PF is not vfio-pci, no VF token */

			pci_info_ratelimited(vdev->pdev,
				"VF token incorrectly provided, PF not bound to vfio-pci\n");
			return -EINVAL;
		}

		if (!vf_token) {
			pci_info_ratelimited(vdev->pdev,
				"VF token required to access device\n");
			return -EACCES;
		}

		mutex_lock(&pf_vdev->vf_token->lock);
		match = uuid_equal(uuid, &pf_vdev->vf_token->uuid);
		mutex_unlock(&pf_vdev->vf_token->lock);

		if (!match) {
			pci_info_ratelimited(vdev->pdev,
				"Incorrect VF token provided for device\n");
			return -EACCES;
		}
	} else if (vdev->vf_token) {
		mutex_lock(&vdev->vf_token->lock);
		if (vdev->vf_token->users) {
			if (!vf_token) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"VF token required to access device\n");
				return -EACCES;
			}

			if (!uuid_equal(uuid, &vdev->vf_token->uuid)) {
				mutex_unlock(&vdev->vf_token->lock);
				pci_info_ratelimited(vdev->pdev,
					"Incorrect VF token provided for device\n");
				return -EACCES;
			}
		} else if (vf_token) {
			uuid_copy(&vdev->vf_token->uuid, uuid);
		}

		mutex_unlock(&vdev->vf_token->lock);
	} else if (vf_token) {
		pci_info_ratelimited(vdev->pdev,
			"VF token incorrectly provided, not a PF or VF\n");
		return -EINVAL;
	}

	return 0;
}

#define VF_TOKEN_ARG "vf_token="

int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
{
	struct vfio_pci_core_device *vdev =
		container_of(core_vdev, struct vfio_pci_core_device, vdev);
	bool vf_token = false;
	uuid_t uuid;
	int ret;

	if (strncmp(pci_name(vdev->pdev), buf, strlen(pci_name(vdev->pdev))))
		return 0; /* No match */

	if (strlen(buf) > strlen(pci_name(vdev->pdev))) {
		buf += strlen(pci_name(vdev->pdev));

		if (*buf != ' ')
			return 0; /* No match: non-whitespace after name */

		while (*buf) {
			if (*buf == ' ') {
				buf++;
				continue;
			}

			if (!vf_token && !strncmp(buf, VF_TOKEN_ARG,
						  strlen(VF_TOKEN_ARG))) {
				buf += strlen(VF_TOKEN_ARG);

				if (strlen(buf) < UUID_STRING_LEN)
					return -EINVAL;

				ret = uuid_parse(buf, &uuid);
				if (ret)
					return ret;
vf_token = true;
|
|
|
|
buf += UUID_STRING_LEN;
|
|
|
|
} else {
|
|
|
|
/* Unknown/duplicate option */
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
return 1; /* Match */
|
2020-03-24 15:28:26 +00:00
|
|
|
}
|
2021-08-26 10:39:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(vfio_pci_core_match);
|
2020-03-24 15:28:26 +00:00
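As a concrete illustration of the option format above, a userspace driver
might request a VF's device fd roughly as in the following sketch. This is
a minimal example, not part of the driver: the group number, device name,
and token value are hypothetical, reusing the example from the message
above.

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int open_vf_device(void)
{
        int group_fd, device_fd;

        /* Hypothetical IOMMU group containing VF 0000:03:00.2 */
        group_fd = open("/dev/vfio/42", O_RDWR);
        if (group_fd < 0)
                return -1;

        /*
         * Device name first, then space-separated key=value options;
         * the token must match the UUID held by the PF driver.
         */
        device_fd = ioctl(group_fd, VFIO_GROUP_GET_DEVICE_FD,
                          "0000:03:00.2 vf_token=2ab74924-c335-45f4-9b16-8569e5b08258");
        if (device_fd < 0)
                perror("VFIO_GROUP_GET_DEVICE_FD");
        return device_fd;
}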

static int vfio_pci_bus_notifier(struct notifier_block *nb,
                                 unsigned long action, void *data)
{
        struct vfio_pci_core_device *vdev = container_of(nb,
                                                    struct vfio_pci_core_device, nb);
        struct device *dev = data;
        struct pci_dev *pdev = to_pci_dev(dev);
        struct pci_dev *physfn = pci_physfn(pdev);

        if (action == BUS_NOTIFY_ADD_DEVICE &&
            pdev->is_virtfn && physfn == vdev->pdev) {
                pci_info(vdev->pdev, "Captured SR-IOV VF %s driver_override\n",
                         pci_name(pdev));
                pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
                                                  vdev->vdev.ops->name);
                WARN_ON(!pdev->driver_override);
        } else if (action == BUS_NOTIFY_BOUND_DRIVER &&
                   pdev->is_virtfn && physfn == vdev->pdev) {
                struct pci_driver *drv = pci_dev_driver(pdev);

                if (drv && drv != pci_dev_driver(vdev->pdev))
                        pci_warn(vdev->pdev,
                                 "VF %s bound to driver %s while PF bound to driver %s\n",
                                 pci_name(pdev), drv->name,
                                 pci_dev_driver(vdev->pdev)->name);
        }

        return 0;
}
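The BUS_NOTIFY_ADD_DEVICE arm above automates, in-kernel, what an
administrator would otherwise do by hand through the driver_override sysfs
attribute before a VF binds. For reference, a userspace equivalent might
look like the following sketch; the helper name and the device path
argument are illustrative, not part of this driver.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical VF address; the bus notifier above does this in-kernel. */
static int set_driver_override(const char *bdf, const char *driver)
{
        char path[128];
        int fd, ret;

        snprintf(path, sizeof(path),
                 "/sys/bus/pci/devices/%s/driver_override", bdf);
        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        ret = write(fd, driver, strlen(driver)) < 0 ? -1 : 0;
        close(fd);
        return ret;
}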

static int vfio_pci_vf_init(struct vfio_pci_core_device *vdev)
{
        struct pci_dev *pdev = vdev->pdev;
        struct vfio_pci_core_device *cur;
        struct pci_dev *physfn;
        int ret;

        if (pdev->is_virtfn) {
                /*
                 * If this VF was created by our vfio_pci_core_sriov_configure()
                 * then we can find the PF vfio_pci_core_device now, and due to
                 * the locking in pci_disable_sriov() it cannot change until
                 * this VF device driver is removed.
                 */
                physfn = pci_physfn(vdev->pdev);
                mutex_lock(&vfio_pci_sriov_pfs_mutex);
                list_for_each_entry(cur, &vfio_pci_sriov_pfs, sriov_pfs_item) {
                        if (cur->pdev == physfn) {
                                vdev->sriov_pf_core_dev = cur;
                                break;
                        }
                }
                mutex_unlock(&vfio_pci_sriov_pfs_mutex);
                return 0;
        }

        /* Not a SRIOV PF */
        if (!pdev->is_physfn)
                return 0;

        vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL);
        if (!vdev->vf_token)
                return -ENOMEM;

        mutex_init(&vdev->vf_token->lock);
        uuid_gen(&vdev->vf_token->uuid);

        vdev->nb.notifier_call = vfio_pci_bus_notifier;
        ret = bus_register_notifier(&pci_bus_type, &vdev->nb);
        if (ret) {
                kfree(vdev->vf_token);
                return ret;
        }
        return 0;
}
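The default token generated by uuid_gen() above is random and cannot be
read back from userspace, so a PF user is expected to install its own
token before VF users can open their devices. Using the VFIO_DEVICE_FEATURE
uAPI, that might look roughly like the sketch below; the helper name is
hypothetical, and the payload layout assumes the 16-byte UUID the PCI VF
token feature expects.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: replace the PF's randomly generated default VF token. */
static int set_vf_token(int pf_device_fd, const unsigned char uuid[16])
{
        char buf[sizeof(struct vfio_device_feature) + 16];
        struct vfio_device_feature *feature = (void *)buf;

        feature->argsz = sizeof(buf);
        feature->flags = VFIO_DEVICE_FEATURE_SET |
                         VFIO_DEVICE_FEATURE_PCI_VF_TOKEN;
        memcpy(feature->data, uuid, 16);

        return ioctl(pf_device_fd, VFIO_DEVICE_FEATURE, feature);
}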

static void vfio_pci_vf_uninit(struct vfio_pci_core_device *vdev)
{
        if (!vdev->vf_token)
                return;

        bus_unregister_notifier(&pci_bus_type, &vdev->nb);
        WARN_ON(vdev->vf_token->users);
        mutex_destroy(&vdev->vf_token->lock);
        kfree(vdev->vf_token);
}

static int vfio_pci_vga_init(struct vfio_pci_core_device *vdev)
{
        struct pci_dev *pdev = vdev->pdev;
        int ret;

        if (!vfio_pci_is_vga(pdev))
                return 0;

        ret = aperture_remove_conflicting_pci_devices(pdev, vdev->vdev.ops->name);
        if (ret)
                return ret;

        ret = vga_client_register(pdev, vfio_pci_set_decode);
        if (ret)
                return ret;
        vga_set_legacy_decoding(pdev, vfio_pci_set_decode(pdev, false));
        return 0;
}
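vga_client_register() expects a set_decode callback that reports which VGA
resources the device still decodes; the arbiter invokes it as other VGA
clients come and go. A minimal sketch of such a callback follows. It is
illustrative only: the interpretation of the bool parameter and the
resource choices here are assumptions, not the actual vfio_pci_set_decode()
implementation.

/* Illustrative set_decode callback; not the real vfio_pci_set_decode(). */
static unsigned int example_set_decode(struct pci_dev *pdev, bool single_vga)
{
        /*
         * Assume: if this is the only VGA device, keep decoding the
         * legacy ranges; otherwise claim only the normal resources.
         */
        if (single_vga)
                return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
                       VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM;

        return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
}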

static void vfio_pci_vga_uninit(struct vfio_pci_core_device *vdev)
{
        struct pci_dev *pdev = vdev->pdev;

        if (!vfio_pci_is_vga(pdev))
                return;
        vga_client_unregister(pdev);
        vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM |
                                              VGA_RSRC_LEGACY_IO |
                                              VGA_RSRC_LEGACY_MEM);
}

int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
{
        struct vfio_pci_core_device *vdev =
                container_of(core_vdev, struct vfio_pci_core_device, vdev);

        vdev->pdev = to_pci_dev(core_vdev->dev);
        vdev->irq_type = VFIO_PCI_NUM_IRQS;
        mutex_init(&vdev->igate);
        spin_lock_init(&vdev->irqlock);
        mutex_init(&vdev->ioeventfds_lock);
        INIT_LIST_HEAD(&vdev->dummy_resources_list);
        INIT_LIST_HEAD(&vdev->ioeventfds_list);
        INIT_LIST_HEAD(&vdev->sriov_pfs_item);
        init_rwsem(&vdev->memory_lock);
        xa_init(&vdev->ctx);

        return 0;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_init_dev);

void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
{
        struct vfio_pci_core_device *vdev =
                container_of(core_vdev, struct vfio_pci_core_device, vdev);

        mutex_destroy(&vdev->igate);
        mutex_destroy(&vdev->ioeventfds_lock);
        kfree(vdev->region);
        kfree(vdev->pm_save);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_release_dev);
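Taken together, these two helpers are meant to be plugged into a variant
driver's vfio_device_ops. A sketch of that wiring follows; the ops-table
name and the my_open_device callback are hypothetical, while the other
vfio_pci_core_* entries are helpers exported by this core module.

/*
 * Sketch: how a variant driver might wire the core helpers into its
 * vfio_device_ops. "my_open_device" stands in for whatever the variant
 * driver provides.
 */
static const struct vfio_device_ops my_vfio_pci_ops = {
        .name           = "my-vfio-pci",
        .init           = vfio_pci_core_init_dev,
        .release        = vfio_pci_core_release_dev,
        .open_device    = my_open_device,
        .close_device   = vfio_pci_core_close_device,
        .read           = vfio_pci_core_read,
        .write          = vfio_pci_core_write,
        .ioctl          = vfio_pci_core_ioctl,
        .mmap           = vfio_pci_core_mmap,
        .request        = vfio_pci_core_request,
        .match          = vfio_pci_core_match,
};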

vfio/pci: Move the unused device into low power state with runtime PM
Currently, there is very limited power management support available in the
upstream vfio_pci_core based drivers. If there are no users of the device,
then the PCI device will be moved into the D3hot state by writing directly
into the PCI PM registers. This D3hot state helps save power, but we can
achieve zero power consumption if we go into the D3cold state. The D3cold
state cannot be reached with native PCI PM; it requires interaction with
platform firmware, which is system-specific. To go into low power states
(including D3cold), the runtime PM framework can be used, which internally
interacts with PCI and platform firmware and puts the device into the
lowest possible D-state.
This patch registers vfio_pci_core based drivers with the runtime PM
framework.
1. The PCI core framework takes care of most of the runtime PM related
work. To enable runtime PM, the PCI driver needs to decrement the usage
count and, at a minimum, provide a 'struct dev_pm_ops'. The runtime
suspend/resume callbacks are optional and needed only if extra handling is
required. There are now multiple vfio_pci_core based drivers, so instead
of assigning the 'struct dev_pm_ops' in each individual parent driver,
vfio_pci_core itself assigns it. There are other drivers where the
'struct dev_pm_ops' is assigned inside the core layer (for example,
wlcore_probe() and some sound drivers).
2. This patch provides the stub implementation of 'struct dev_pm_ops'.
A subsequent patch will provide the runtime suspend/resume callbacks. All
the config state saving and PCI power management related work will be done
by the PCI core framework itself inside its runtime suspend/resume
callbacks (pci_pm_runtime_suspend() and pci_pm_runtime_resume()).
3. Inside pci_reset_bus(), all the devices in a dev_set need to be runtime
resumed. vfio_pci_dev_set_pm_runtime_get() will take care of the runtime
resume and its error handling.
4. Inside vfio_pci_core_disable(), the device usage count, which was
incremented in vfio_pci_core_enable(), always needs to be decremented.
5. Since the runtime PM framework provides the same functionality, direct
writes into the PCI PM config register can be replaced with runtime PM
routines, which can also yield additional power savings.
In systems which do not support D3cold, with the existing implementation:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D0
With runtime PM:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D3hot
So, with runtime PM, the upstream bridge or root port will also go into a
lower power state, which is not possible with the existing implementation.
In systems which support D3cold, with the existing implementation:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D0
With runtime PM:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3cold
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D3cold
So, with runtime PM, both the PCI device and the upstream bridge will go
into the D3cold state.
6. If the 'disable_idle_d3' module parameter is set, runtime PM will still
be enabled, but in this case the usage count should not be decremented.
7. The vfio_pci_dev_set_try_reset() return value is now unused, so its
return type can be changed to void.
8. Use the runtime PM APIs in vfio_pci_core_sriov_configure(). The device
can be in a low power state either through runtime power management (when
there is no user) or through a PCI_PM_CTRL register write by the user. In
both cases, the PF should be moved to the D0 state. To prevent any runtime
usage mismatch, pci_num_vf() is called explicitly during disable.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220518111612.16985-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
{
        struct pci_dev *pdev = vdev->pdev;
        struct device *dev = &pdev->dev;
        int ret;

        /* Drivers must set the vfio_pci_core_device to their drvdata */
        if (WARN_ON(vdev != dev_get_drvdata(dev)))
                return -EINVAL;

        if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL)
                return -EINVAL;

        if (vdev->vdev.mig_ops) {
                if (!(vdev->vdev.mig_ops->migration_get_state &&
                      vdev->vdev.mig_ops->migration_set_state &&
                      vdev->vdev.mig_ops->migration_get_data_size) ||
                    !(vdev->vdev.migration_flags & VFIO_MIGRATION_STOP_COPY))
                        return -EINVAL;
        }

        if (vdev->vdev.log_ops && !(vdev->vdev.log_ops->log_start &&
            vdev->vdev.log_ops->log_stop &&
            vdev->vdev.log_ops->log_read_and_clear))
                return -EINVAL;

        /*
         * Prevent binding to PFs with VFs enabled, the VFs might be in use
         * by the host or other users.  We cannot capture the VFs if they
         * already exist, nor can we track VF users.  Disabling SR-IOV here
         * would initiate removing the VFs, which would unbind the driver,
         * which is prone to blocking if that VF is also in use by vfio-pci.
         * Just reject these PFs and let the user sort it out.
         */
        if (pci_num_vf(pdev)) {
                pci_warn(pdev, "Cannot bind to PF with SR-IOV enabled\n");
                return -EBUSY;
        }

        if (pci_is_root_bus(pdev->bus)) {
                ret = vfio_assign_device_set(&vdev->vdev, vdev);
        } else if (!pci_probe_reset_slot(pdev->slot)) {
                ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
        } else {
                /*
                 * If there is no slot reset support for this device, the whole
                 * bus needs to be grouped together to support bus-wide resets.
                 */
                ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
        }

        if (ret)
                return ret;
        ret = vfio_pci_vf_init(vdev);
        if (ret)
                return ret;
        ret = vfio_pci_vga_init(vdev);
        if (ret)
                goto out_vf;

        vfio_pci_probe_power_state(vdev);

        /*
         * pci-core sets the device power state to an unknown value at
         * bootup and after being removed from a driver.  The only
         * transition it allows from this unknown state is to D0, which
         * typically happens when a driver calls pci_enable_device().
         * We're not ready to enable the device yet, but we do want to
         * be able to get to D3.  Therefore first do a D0 transition
         * before enabling runtime PM.
         */
        vfio_pci_set_power_state(vdev, PCI_D0);

        dev->driver->pm = &vfio_pci_core_pm_ops;
        pm_runtime_allow(dev);
        if (!disable_idle_d3)
                pm_runtime_put(dev);

        ret = vfio_register_group_dev(&vdev->vdev);
        if (ret)
                goto out_power;
        return 0;

out_power:
        if (!disable_idle_d3)
                pm_runtime_get_noresume(dev);

        pm_runtime_forbid(dev);
out_vf:
        vfio_pci_vf_uninit(vdev);
        return ret;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_register_device);
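A registration sketch from a hypothetical variant driver's probe(),
showing the drvdata contract that the WARN_ON above enforces.
my_vfio_pci_ops refers to the ops-table sketch earlier; error handling is
kept minimal.

static int my_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
        struct vfio_pci_core_device *vdev;
        int ret;

        vdev = vfio_alloc_device(vfio_pci_core_device, vdev,
                                 &pdev->dev, &my_vfio_pci_ops);
        if (IS_ERR(vdev))
                return PTR_ERR(vdev);

        /* register_device WARNs unless drvdata points at the core device */
        dev_set_drvdata(&pdev->dev, vdev);
        ret = vfio_pci_core_register_device(vdev);
        if (ret) {
                vfio_put_device(&vdev->vdev);
                return ret;
        }
        return 0;
}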

void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev)
{
        vfio_pci_core_sriov_configure(vdev, 0);

        vfio_unregister_group_dev(&vdev->vdev);

        vfio_pci_vf_uninit(vdev);
        vfio_pci_vga_uninit(vdev);

        if (!disable_idle_d3)
                pm_runtime_get_noresume(&vdev->pdev->dev);

        pm_runtime_forbid(&vdev->pdev->dev);
}
EXPORT_SYMBOL_GPL(vfio_pci_core_unregister_device);
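And the matching teardown from the same hypothetical driver's remove()
callback:

static void my_remove(struct pci_dev *pdev)
{
        struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);

        vfio_pci_core_unregister_device(vdev);
        vfio_put_device(&vdev->vdev);
}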

pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
                                                pci_channel_state_t state)
{
        struct vfio_pci_core_device *vdev = dev_get_drvdata(&pdev->dev);

        mutex_lock(&vdev->igate);

        if (vdev->err_trigger)
                eventfd_signal(vdev->err_trigger);

        mutex_unlock(&vdev->igate);

        return PCI_ERS_RESULT_CAN_RECOVER;
}
EXPORT_SYMBOL_GPL(vfio_pci_core_aer_err_detected);
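Userspace opts into these notifications by attaching an eventfd to the
error IRQ index; err_trigger above is what that arms. A rough sketch using
the standard VFIO uAPI follows; the helper name is illustrative.

#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Sketch: arm err_trigger so AER errors signal an eventfd. */
static int arm_err_irq(int device_fd)
{
        char buf[sizeof(struct vfio_irq_set) + sizeof(int)];
        struct vfio_irq_set *set = (void *)buf;
        int efd = eventfd(0, EFD_CLOEXEC);

        if (efd < 0)
                return -1;

        set->argsz = sizeof(buf);
        set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        set->index = VFIO_PCI_ERR_IRQ_INDEX;
        set->start = 0;
        set->count = 1;
        memcpy(set->data, &efd, sizeof(int));

        if (ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set) < 0)
                return -1;
        return efd;
}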

vfio/pci: Change the PF power state to D0 before enabling VFs
According to [PCIe v5 9.6.2] for PF Device Power Management States,
"The PF's power management state (D-state) has global impact on its
associated VFs. If a VF does not implement the Power Management
Capability, then it behaves as if it is in an equivalent
power state of its associated PF.
If a VF implements the Power Management Capability, the Device behavior
is undefined if the PF is placed in a lower power state than the VF.
Software should avoid this situation by placing all VFs in lower power
state before lowering their associated PF's power state."
From the vfio driver side, the user can enable SR-IOV while the PF is in
the D3hot state. If a VF does not implement the Power Management
Capability, the VF will actually be in the D3hot state and VF BAR
accesses will fail. If a VF does implement the Power Management
Capability, the VF will assume that its current power state is D0 while
the PF is in D3hot, and in this case the behavior is undefined.
To support PF power management, we need to create a power management
dependency between the PF and its VFs. Runtime power management support
may eventually help here, since it supports power management dependencies
through device links. Until such support is in place, we can disallow the
PF from going into a low power state while it has VFs enabled.
There can be a case where the user first enables and then disables the
VFs. If there is no user of the PF, it could then be put into the D3hot
state again. But with this patch, the PF will still be in the D0 state
after disabling the VFs, since detecting this case inside
vfio_pci_core_sriov_configure() requires access to
struct vfio_device::open_count along with its locks. The subsequent
patches related to runtime PM will handle this case, since runtime PM
maintains its own usage count.
Also, vfio_pci_core_sriov_configure() can be called at any time (with and
without a vfio pci device user), so the power state change and SR-IOV
enablement need to be protected with the required locks.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220518111612.16985-3-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>

int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
                                  int nr_virtfn)
{
        struct pci_dev *pdev = vdev->pdev;
        int ret = 0;

        device_lock_assert(&pdev->dev);

        if (nr_virtfn) {
                mutex_lock(&vfio_pci_sriov_pfs_mutex);
                /*
                 * The thread that adds the vdev to the list is the only thread
                 * that gets to call pci_enable_sriov() and we will only allow
                 * it to be called once without going through
                 * pci_disable_sriov()
                 */
                if (!list_empty(&vdev->sriov_pfs_item)) {
                        ret = -EINVAL;
                        goto out_unlock;
                }
                list_add_tail(&vdev->sriov_pfs_item, &vfio_pci_sriov_pfs);
                mutex_unlock(&vfio_pci_sriov_pfs_mutex);

                /*
                 * The PF power state should always be higher than the VF power
                 * state.  The PF can be in low power state either with runtime
                 * power management (when there is no user) or PCI_PM_CTRL
                 * register write by the user.  If PF is in the low power state,
                 * then change the power state to D0 first before enabling
                 * SR-IOV.  Also, this function can be called at any time, and
                 * userspace PCI_PM_CTRL write can race against this code path,
                 * so protect the same with 'memory_lock'.
                 */
vfio/pci: Move the unused device into low power state with runtime PM
Currently, there is very limited power management support in the
upstream vfio_pci_core based drivers. If there are no users of the
device, the PCI device is moved into the D3hot state by writing
directly into the PCI PM registers. D3hot saves power, but zero power
consumption is only achievable in the D3cold state, which cannot be
reached with native PCI PM alone: it requires system-specific
interaction with platform firmware. To enter low power states
(including D3cold), the runtime PM framework can be used; it internally
interacts with the PCI core and platform firmware and puts the device
into the lowest reachable D-state.
This patch registers vfio_pci_core based drivers with the
runtime PM framework.
1. The PCI core framework takes care of most of the runtime PM
details. To enable runtime PM, a PCI driver needs only to decrement
the usage count and provide a 'struct dev_pm_ops'. The runtime
suspend/resume callbacks are optional and needed only for extra
handling. Since there are now multiple vfio_pci_core based drivers,
vfio_pci_core itself assigns the 'struct dev_pm_ops' instead of each
parent driver doing so individually; other drivers assign the
'struct dev_pm_ops' inside a core layer in the same way (for example,
wlcore_probe() and some sound drivers). A registration sketch follows
this log.
2. This patch provides a stub implementation of 'struct dev_pm_ops';
the subsequent patch will provide the runtime suspend/resume
callbacks. All config state saving and PCI power management is done
by the PCI core framework itself inside its runtime suspend/resume
callbacks (pci_pm_runtime_suspend() and pci_pm_runtime_resume()).
3. Inside pci_reset_bus(), all the devices in the dev_set need to be
runtime resumed. vfio_pci_dev_set_pm_runtime_get() takes care of the
runtime resume and its error handling.
4. Inside vfio_pci_core_disable(), the device usage count incremented
in vfio_pci_core_enable() must always be decremented.
5. Since the runtime PM framework provides the same functionality,
the direct writes into the PCI PM config register can be replaced
with runtime PM routines, and using runtime PM also saves more power.
On systems which do not support D3cold, with the existing
implementation:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D0
With runtime PM:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D3hot
So, with runtime PM, the upstream bridge or root port also goes into
a lower power state, which is not possible with the existing
implementation.
On systems which support D3cold, with the existing implementation:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D0
With runtime PM:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3cold
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D3cold
So, with runtime PM, both the PCI device and the upstream bridge go
into the D3cold state.
6. If the 'disable_idle_d3' module parameter is set, runtime PM is
still enabled, but in that case the usage count is not decremented.
7. The return value of vfio_pci_dev_set_try_reset() is now unused, so
its return type can be changed to void.
8. Use the runtime PM APIs in vfio_pci_core_sriov_configure().
The device can be in a low power state either via runtime
power management (when there is no user) or via a PCI_PM_CTRL
register write by the user. In both cases, the PF should be moved to
the D0 state. To prevent any runtime usage-count mismatch, pci_num_vf()
is checked explicitly during disable.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220518111612.16985-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-05-18 11:16:12 +00:00
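To make point 1 of the log above concrete, here is a minimal sketch of
the registration pattern it describes. This is not the exact vfio-pci
code; the driver name and function names are illustrative:

#include <linux/pci.h>
#include <linux/pm_runtime.h>

/*
 * An empty dev_pm_ops is enough: the PCI core saves config state and
 * changes D-states in pci_pm_runtime_suspend()/pci_pm_runtime_resume().
 * Runtime suspend/resume callbacks are added only for extra handling.
 */
static const struct dev_pm_ops sample_core_pm_ops = {};

static int sample_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	/*
	 * Drop the usage count held by the PCI core so the device may
	 * runtime suspend (and reach D3cold where the platform allows)
	 * once it is idle; the matching get belongs in remove.
	 */
	pm_runtime_put(&pdev->dev);
	return 0;
}

static struct pci_driver sample_pci_driver = {
	.name	= "sample-vfio-pci",
	.probe	= sample_probe,
	/* .id_table and .remove elided for brevity */
	.driver = {
		.pm = &sample_core_pm_ops,
	},
};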
|
|
|
ret = pm_runtime_resume_and_get(&pdev->dev);
|
|
|
|
if (ret)
|
|
|
|
goto out_del;
|
|
|
|
|
vfio/pci: Change the PF power state to D0 before enabling VFs
According to [PCIe v5 9.6.2] for PF Device Power Management States:
"The PF's power management state (D-state) has global impact on its
associated VFs. If a VF does not implement the Power Management
Capability, then it behaves as if it is in an equivalent
power state of its associated PF.
If a VF implements the Power Management Capability, the Device behavior
is undefined if the PF is placed in a lower power state than the VF.
Software should avoid this situation by placing all VFs in lower power
state before lowering their associated PF's power state."
From the vfio driver side, the user can enable SR-IOV while the PF is
in the D3hot state. If a VF does not implement the Power Management
Capability, the VF will actually be in the D3hot state and VF BAR
access will fail. If a VF does implement the Power Management
Capability, the VF will assume its current power state is D0 while the
PF is in D3hot, and in this case the behavior is undefined.
To support PF power management, a power management dependency needs to
be created between the PF and its VFs. Runtime power management may
eventually provide this through device links (sketched below), but
until such support is in place, disallow the PF from entering a low
power state while it has VFs enabled.
There is a case where the user first enables the VFs and then
disables them. If the PF then has no user, it could be put into the
D3hot state again. With this patch, the PF will still be in the D0
state after disabling the VFs, since detecting this case inside
vfio_pci_core_sriov_configure() requires access to
struct vfio_device::open_count along with its locks. The subsequent
runtime PM patches handle this case, since runtime PM maintains its
own usage count.
Also, vfio_pci_core_sriov_configure() can be called at any time
(with or without a vfio-pci device user), so the power state change
and SR-IOV enablement must be protected by the required locks.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220518111612.16985-3-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-05-18 11:16:10 +00:00
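The device-link mechanism alluded to above could look roughly like the
following sketch. This is a possible future direction, not what the
patch implements, and vf_pdev/pf_pdev are hypothetical pci_dev
pointers:

#include <linux/device.h>
#include <linux/pci.h>

/*
 * A DL_FLAG_PM_RUNTIME link makes the supplier (the PF) runtime resume
 * whenever the consumer (a VF) is runtime active, expressing the PF/VF
 * power dependency described above.
 */
static int link_vf_to_pf(struct pci_dev *vf_pdev, struct pci_dev *pf_pdev)
{
	struct device_link *link;

	link = device_link_add(&vf_pdev->dev, &pf_pdev->dev,
			       DL_FLAG_STATELESS | DL_FLAG_PM_RUNTIME);
	return link ? 0 : -ENODEV;
}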
|
|
|
down_write(&vdev->memory_lock);
|
|
|
|
vfio_pci_set_power_state(vdev, PCI_D0);
|
2020-03-24 15:28:28 +00:00
|
|
|
ret = pci_enable_sriov(pdev, nr_virtfn);
|
vfio/pci: Change the PF power state to D0 before enabling VFs
2022-05-18 11:16:10 +00:00
|
|
|
up_write(&vdev->memory_lock);
|
vfio/pci: Move the unused device into low power state with runtime PM
2022-05-18 11:16:12 +00:00
|
|
|
if (ret) {
|
|
|
|
pm_runtime_put(&pdev->dev);
|
2022-04-13 13:10:36 +00:00
|
|
|
goto out_del;
|
vfio/pci: Move the unused device into low power state with runtime PM
2022-05-18 11:16:12 +00:00
|
|
|
}
|
2022-05-11 19:19:07 +00:00
|
|
|
return nr_virtfn;
|
2022-04-13 13:10:36 +00:00
|
|
|
}
|
2020-03-24 15:28:28 +00:00
|
|
|
|
vfio/pci: Move the unused device into low power state with runtime PM
2022-05-18 11:16:12 +00:00
|
|
|
if (pci_num_vf(pdev)) {
|
|
|
|
pci_disable_sriov(pdev);
|
|
|
|
pm_runtime_put(&pdev->dev);
|
|
|
|
}
|
2020-03-24 15:28:28 +00:00
|
|
|
|
2022-04-13 13:10:36 +00:00
|
|
|
out_del:
|
|
|
|
mutex_lock(&vfio_pci_sriov_pfs_mutex);
|
|
|
|
list_del_init(&vdev->sriov_pfs_item);
|
|
|
|
out_unlock:
|
|
|
|
mutex_unlock(&vfio_pci_sriov_pfs_mutex);
|
|
|
|
return ret;
|
2020-03-24 15:28:28 +00:00
|
|
|
}
|
2021-08-26 10:39:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(vfio_pci_core_sriov_configure);
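For context, this entry point is reached from the PCI core's
'sriov_numvfs' sysfs attribute. A userspace sketch of triggering it;
the BDF string in the comment is illustrative:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Request 'nr' VFs from a PF, e.g. set_sriov_numvfs("0000:01:00.0", 4). */
static int set_sriov_numvfs(const char *bdf, int nr)
{
	char path[128], buf[16];
	int fd, len, ret;

	snprintf(path, sizeof(path),
		 "/sys/bus/pci/devices/%s/sriov_numvfs", bdf);
	len = snprintf(buf, sizeof(buf), "%d", nr);

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	ret = (write(fd, buf, len) == (ssize_t)len) ? 0 : -1;
	close(fd);
	return ret;
}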
|
2020-03-24 15:28:28 +00:00
|
|
|
|
2021-08-26 10:39:05 +00:00
|
|
|
const struct pci_error_handlers vfio_pci_core_err_handlers = {
|
2022-02-24 14:20:23 +00:00
|
|
|
.error_detected = vfio_pci_core_aer_err_detected,
|
2013-03-11 15:31:22 +00:00
|
|
|
};
|
2021-08-26 10:39:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers);
|
2013-03-11 15:31:22 +00:00
|
|
|
|
2023-07-18 10:55:42 +00:00
|
|
|
static bool vfio_dev_in_groups(struct vfio_device *vdev,
|
2021-08-06 01:19:06 +00:00
|
|
|
struct vfio_pci_group_info *groups)
|
2014-08-07 17:12:07 +00:00
|
|
|
{
|
2021-08-06 01:19:06 +00:00
|
|
|
unsigned int i;
|
2014-08-07 17:12:07 +00:00
|
|
|
|
2023-07-18 10:55:42 +00:00
|
|
|
if (!groups)
|
|
|
|
return false;
|
|
|
|
|
2021-08-06 01:19:06 +00:00
|
|
|
for (i = 0; i < groups->count; i++)
|
2023-07-18 10:55:42 +00:00
|
|
|
if (vfio_file_has_dev(groups->files[i], vdev))
|
2021-08-06 01:19:06 +00:00
|
|
|
return true;
|
|
|
|
return false;
|
2014-08-07 17:12:07 +00:00
|
|
|
}
|
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data)
|
2020-04-22 19:48:11 +00:00
|
|
|
{
|
2021-08-06 01:19:05 +00:00
|
|
|
struct vfio_device_set *dev_set = data;
|
2020-04-22 19:48:11 +00:00
|
|
|
|
2023-07-18 10:55:39 +00:00
|
|
|
return vfio_find_device_in_devset(dev_set, &pdev->dev) ? 0 : -ENODEV;
|
2021-08-06 01:19:05 +00:00
|
|
|
}
|
2020-04-22 19:48:11 +00:00
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
/*
|
|
|
|
* vfio-core considers a group to be viable and will create a vfio_device even
|
|
|
|
* if some devices are bound to drivers like pci-stub or pcieport. Here we
|
|
|
|
* require all PCI devices to be inside our dev_set since that ensures they stay
|
|
|
|
* put and that every driver controlling the device can co-ordinate with the
|
|
|
|
* device reset.
|
|
|
|
*
|
|
|
|
* Returns the pci_dev to pass to pci_reset_bus() if every PCI device to be
|
|
|
|
* reset is inside the dev_set, and pci_reset_bus() can succeed. NULL otherwise.
|
|
|
|
*/
|
|
|
|
static struct pci_dev *
|
|
|
|
vfio_pci_dev_set_resettable(struct vfio_device_set *dev_set)
|
|
|
|
{
|
|
|
|
struct pci_dev *pdev;
|
2020-04-22 19:48:11 +00:00
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
lockdep_assert_held(&dev_set->lock);
|
2020-04-22 19:48:11 +00:00
|
|
|
|
|
|
|
/*
|
2021-08-06 01:19:05 +00:00
|
|
|
* By definition all PCI devices in the dev_set share the same PCI
|
|
|
|
* reset, so any pci_dev will have the same outcomes for
|
|
|
|
* pci_probe_reset_*() and pci_reset_bus().
|
2020-04-22 19:48:11 +00:00
|
|
|
*/
|
2021-08-26 10:39:02 +00:00
|
|
|
pdev = list_first_entry(&dev_set->device_list,
|
|
|
|
struct vfio_pci_core_device,
|
2021-08-06 01:19:05 +00:00
|
|
|
vdev.dev_set_list)->pdev;
|
2020-04-22 19:48:11 +00:00
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
/* pci_reset_bus() is supported */
|
|
|
|
if (pci_probe_reset_slot(pdev->slot) && pci_probe_reset_bus(pdev->bus))
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (vfio_pci_for_each_slot_or_bus(pdev, vfio_pci_is_device_in_set,
|
|
|
|
dev_set,
|
|
|
|
!pci_probe_reset_slot(pdev->slot)))
|
|
|
|
return NULL;
|
|
|
|
return pdev;
|
|
|
|
}
|
|
|
|
|
vfio/pci: Move the unused device into low power state with runtime PM
2022-05-18 11:16:12 +00:00
|
|
|
static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
|
|
|
|
{
|
|
|
|
struct vfio_pci_core_device *cur;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
|
|
|
|
ret = pm_runtime_resume_and_get(&cur->pdev->dev);
|
|
|
|
if (ret)
|
|
|
|
goto unwind;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
unwind:
|
|
|
|
list_for_each_entry_continue_reverse(cur, &dev_set->device_list,
|
|
|
|
vdev.dev_set_list)
|
|
|
|
pm_runtime_put(&cur->pdev->dev);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
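The function above acquires a reference on every device or none at
all: list_for_each_entry_continue_reverse() resumes just before the
failing entry, so only references that were actually taken get
dropped. A generic sketch of the idiom, with illustrative names:

#include <linux/list.h>

struct item {
	struct list_head node;
};

/* Hypothetical per-item acquire/release, for illustration only. */
int acquire(struct item *it);
void release(struct item *it);

static int acquire_all(struct list_head *head)
{
	struct item *cur;
	int ret;

	list_for_each_entry(cur, head, node) {
		ret = acquire(cur);
		if (ret)
			goto unwind;
	}
	return 0;

unwind:
	/* Walks backwards over only the items already acquired. */
	list_for_each_entry_continue_reverse(cur, head, node)
		release(cur);
	return ret;
}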
|
|
|
|
|
2021-08-06 01:19:06 +00:00
|
|
|
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
|
2023-07-18 10:55:42 +00:00
|
|
|
struct vfio_pci_group_info *groups,
|
|
|
|
struct iommufd_ctx *iommufd_ctx)
|
2021-08-06 01:19:06 +00:00
|
|
|
{
|
2024-05-30 04:52:31 +00:00
|
|
|
struct vfio_pci_core_device *vdev;
|
2021-08-06 01:19:06 +00:00
|
|
|
struct pci_dev *pdev;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
mutex_lock(&dev_set->lock);
|
|
|
|
|
|
|
|
pdev = vfio_pci_dev_set_resettable(dev_set);
|
|
|
|
if (!pdev) {
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto err_unlock;
|
|
|
|
}
|
|
|
|
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
Currently, if runtime power management is enabled for vfio-pci
based devices in the guest OS, the guest OS does a register
write to the PCI_PM_CTRL register. This write request is handled in
vfio_pm_config_write(), which performs the actual register write of
PCI_PM_CTRL. With this, at most the D3hot state can be
reached for low power. By using the runtime PM framework instead,
the D3cold state can be reached (on supported systems), which
saves the maximum amount of power.
1. The D3cold state can't be reached by writing the standard PCI
PM config registers. This patch implements the following
newly added low power related device features:
- VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY
- VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
The VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY feature allows the
device to make use of low power platform states on the host,
while VFIO_DEVICE_FEATURE_LOW_POWER_EXIT prevents
further use of those power states. (A userspace sketch of invoking
these features follows this log.)
2. The vfio-pci driver uses the runtime PM framework for low power
entry and exit. On platforms where the D3cold state is supported, the
runtime PM framework puts the device into D3cold; otherwise, D3hot or
some other power state is used.
There are various cases where the device will not go into the runtime
suspended state. For example:
- Runtime power management is disabled on the host side for
the device.
- The user keeps the device busy after calling LOW_POWER_ENTRY.
- There are dependent devices that are still runtime active.
In these cases, the device stays in the power state that was
configured by the user through the PCI_PM_CTRL register.
3. Hypervisors can implement virtual ACPI methods. For example,
in a guest Linux OS, if the PCI device's ACPI node has _PR3 and _PR0
power resources with _ON/_OFF methods, the guest invokes
the _OFF method during the D3cold transition and _ON during the D0
transition. The hypervisor can trap these virtual ACPI calls and then
call the low power device feature ioctl.
4. The 'pm_runtime_engaged' flag tracks runtime PM entry and exit.
This flag is protected by the 'memory_lock' semaphore.
5. All config and other region accesses are wrapped in
pm_runtime_resume_and_get() and pm_runtime_put(). So, if any
device access happens while the device is in the runtime suspended
state, the device is resumed first. Once the access has finished,
the device again goes into the runtime suspended state.
6. Memory region access through mmap is not allowed in the low
power state. Since __vfio_pci_memory_enabled() is a common function,
an explicit check for 'pm_runtime_engaged' has been added in
vfio_pci_mmap_fault() to block only mmap'ed access.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 11:48:49 +00:00
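A userspace sketch of the feature pair described above, assuming
'device_fd' is an already-open vfio device fd; error handling is left
to the caller:

#include <linux/vfio.h>
#include <sys/ioctl.h>

/*
 * Pass VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY to release the device into
 * the platform's low power states, or VFIO_DEVICE_FEATURE_LOW_POWER_EXIT
 * to reclaim it.
 */
static int vfio_low_power(int device_fd, unsigned int feat)
{
	struct vfio_device_feature feature = {
		.argsz = sizeof(feature),
		.flags = VFIO_DEVICE_FEATURE_SET | feat,
	};

	return ioctl(device_fd, VFIO_DEVICE_FEATURE, &feature);
}

/* e.g.: vfio_low_power(device_fd, VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY); */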
|
|
|
/*
|
|
|
|
* Some of the devices in the dev_set can be in the runtime suspended
|
|
|
|
* state. Increment the usage count for all the devices in the dev_set
|
|
|
|
* before reset and decrement it again after reset.
|
|
|
|
*/
|
|
|
|
ret = vfio_pci_dev_set_pm_runtime_get(dev_set);
|
|
|
|
if (ret)
|
|
|
|
goto err_unlock;
|
|
|
|
|
2024-05-30 04:52:31 +00:00
|
|
|
list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
|
2023-07-18 10:55:42 +00:00
|
|
|
bool owned;
|
|
|
|
|
2021-08-06 01:19:06 +00:00
|
|
|
/*
|
2023-07-18 10:55:42 +00:00
|
|
|
* Test whether all the affected devices can be reset by the
|
|
|
|
* user.
|
|
|
|
*
|
|
|
|
* If called from a group opened device and the user provides
|
|
|
|
* a set of groups, all the devices in the dev_set should be
|
|
|
|
* contained by the set of groups provided by the user.
|
|
|
|
*
|
|
|
|
* If called from a cdev opened device and the user provides
|
|
|
|
* a zero-length array, all the devices in the dev_set must
|
|
|
|
* be bound to the same iommufd_ctx as the input iommufd_ctx.
|
|
|
|
* If there is any device that has not been bound to any
|
|
|
|
* iommufd_ctx yet, check if its iommu_group has any device
|
|
|
|
* bound to the input iommufd_ctx. Such devices can be
|
|
|
|
* considered owned by the input iommufd_ctx as the device
|
|
|
|
* cannot be owned by another iommufd_ctx when its iommu_group
|
|
|
|
* is owned.
|
|
|
|
*
|
|
|
|
* Otherwise, reset is not allowed.
|
2021-08-06 01:19:06 +00:00
|
|
|
*/
|
2023-07-18 10:55:42 +00:00
|
|
|
if (iommufd_ctx) {
|
2024-05-30 04:52:31 +00:00
|
|
|
int devid = vfio_iommufd_get_dev_id(&vdev->vdev,
|
2023-07-18 10:55:42 +00:00
|
|
|
iommufd_ctx);
|
|
|
|
|
|
|
|
owned = (devid > 0 || devid == -ENOENT);
|
|
|
|
} else {
|
2024-05-30 04:52:31 +00:00
|
|
|
owned = vfio_dev_in_groups(&vdev->vdev, groups);
|
2023-07-18 10:55:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!owned) {
|
2021-08-06 01:19:06 +00:00
|
|
|
ret = -EINVAL;
|
2024-05-30 04:52:31 +00:00
|
|
|
break;
|
2021-08-06 01:19:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2024-05-30 04:52:31 +00:00
|
|
|
* Take the memory write lock for each device and zap BAR
|
|
|
|
* mappings to prevent the user accessing the device while in
|
|
|
|
* reset. Locking multiple devices is prone to deadlock,
|
|
|
|
* runaway and unwind if we hit contention.
|
2021-08-06 01:19:06 +00:00
|
|
|
*/
|
2024-05-30 04:52:31 +00:00
|
|
|
if (!down_write_trylock(&vdev->memory_lock)) {
|
2021-08-06 01:19:06 +00:00
|
|
|
ret = -EBUSY;
|
2024-05-30 04:52:31 +00:00
|
|
|
break;
|
2021-08-06 01:19:06 +00:00
|
|
|
}
|
2024-05-30 04:52:31 +00:00
|
|
|
|
|
|
|
vfio_pci_zap_bars(vdev);
|
2021-08-06 01:19:06 +00:00
|
|
|
}
|
|
|
|
|
2024-05-30 04:52:31 +00:00
|
|
|
if (!list_entry_is_head(vdev,
|
|
|
|
&dev_set->device_list, vdev.dev_set_list)) {
|
|
|
|
vdev = list_prev_entry(vdev, vdev.dev_set_list);
|
|
|
|
goto err_undo;
|
2021-08-06 01:19:06 +00:00
|
|
|
}
|
|
|
|
|
vfio/pci: wake-up devices around reset functions
If 'vfio_pci_core_device::needs_pm_restore' is set (the PCI device does
not have the No_Soft_Reset bit set in its PMCSR config register), then
the current PCI state is saved locally in
'vfio_pci_core_device::pm_save' during the D0->D3hot transition and
restored during the D3hot->D0 transition. For reset-related
functionality, the vfio driver uses the PCI reset APIs. These
APIs internally change the PCI power state back to D0 first if
the device power state is non-D0. This state change to D0 happens
without the involvement of the vfio driver.
Consider the following example:
1. The device is in D3hot.
2. The user invokes the VFIO_DEVICE_RESET ioctl.
3. pci_try_reset_function() is called, which internally
invokes pci_dev_save_and_disable().
4. pci_set_power_state(dev, PCI_D0) is called first.
5. pci_save_state() then runs.
Now, for devices with NoSoftRst-, pci_set_power_state()
can trigger a soft reset, so the original PCI config state is lost
at step (4) and cannot be restored. This original PCI
state can include any setting performed by the SBIOS or the host
Linux kernel (for example LTR, ASPM L1 substates, etc.). When this
soft reset is triggered, all these settings are reset,
and the device state saved at step (5) has them cleared as well,
so they cannot be restored. Since the vfio driver only exposes
limited PCI capabilities to its user, the user also has no way to
save and restore these capability states, and the original settings
are permanently lost.
The same situation can occur with pci_reset_bus():
the other functions/devices can be in D3hot, and the reset changes
the power state of all devices to D0 without the involvement of the
vfio driver.
So, before calling any reset-related API, we need to make sure that
the device state is D0. This is mainly to preserve the state around
a soft reset.
For vfio_pci_core_disable(), we use __pci_reset_function_locked(),
which internally can use pci_pm_reset() for the function reset.
pci_pm_reset() requires the device power state to be D0, otherwise
it returns an error.
This patch changes the device power state to D0 by invoking
vfio_pci_set_power_state() explicitly before calling any reset-related
API.
Fixes: 51ef3a004b1e ("vfio/pci: Restore device state on PM transition")
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220217122107.22434-3-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-02-17 12:21:07 +00:00
|
|
|
/*
|
|
|
|
* The pci_reset_bus() will reset all the devices in the bus.
|
|
|
|
* The power state can be non-D0 for some of the devices in the bus.
|
|
|
|
* For these devices, the pci_reset_bus() will internally set
|
|
|
|
* the power state to D0 without vfio driver involvement.
|
|
|
|
* For the devices which have NoSoftRst-, the reset function can
|
|
|
|
* cause the PCI config space reset without restoring the original
|
|
|
|
* state (saved locally in 'vdev->pm_save').
|
|
|
|
*/
|
2024-05-30 04:52:31 +00:00
|
|
|
list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
|
|
|
|
vfio_pci_set_power_state(vdev, PCI_D0);
|
vfio/pci: wake-up devices around reset functions
2022-02-17 12:21:07 +00:00
|
|
|
|
2021-08-06 01:19:06 +00:00
|
|
|
ret = pci_reset_bus(pdev);
|
|
|
|
|
2024-05-30 04:52:31 +00:00
|
|
|
vdev = list_last_entry(&dev_set->device_list,
|
|
|
|
struct vfio_pci_core_device, vdev.dev_set_list);
|
|
|
|
|
2021-08-06 01:19:06 +00:00
|
|
|
err_undo:
|
2024-05-30 04:52:31 +00:00
|
|
|
list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
|
|
|
|
vdev.dev_set_list)
|
|
|
|
up_write(&vdev->memory_lock);
|
|
|
|
|
|
|
|
list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
|
|
|
|
pm_runtime_put(&vdev->pdev->dev);
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY/EXIT
2022-08-29 11:48:49 +00:00
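As a rough illustration of how a user (for example, a hypervisor
trapping the virtual ACPI _OFF call from point 3) might request low
power entry, here is a minimal userspace sketch. It assumes
'device_fd' is an already-open vfio device fd and that the uapi
headers provide the feature definitions; error handling is reduced
to the return value:

	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	static int vfio_low_power_entry(int device_fd)
	{
		struct vfio_device_feature feature = {
			.argsz = sizeof(feature),
			/* select the feature and ask the kernel to SET it */
			.flags = VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY,
		};

		/* returns 0 on success, -1 with errno set on failure */
		return ioctl(device_fd, VFIO_DEVICE_FEATURE, &feature);
	}

A matching call with VFIO_DEVICE_FEATURE_LOW_POWER_EXIT ends the
low power window again.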
|
|
|
|
2021-08-06 01:19:06 +00:00
|
|
|
err_unlock:
|
|
|
|
mutex_unlock(&dev_set->lock);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set)
|
|
|
|
{
|
2021-08-26 10:39:02 +00:00
|
|
|
struct vfio_pci_core_device *cur;
|
2021-08-06 01:19:05 +00:00
|
|
|
bool needs_reset = false;
|
|
|
|
|
2022-11-10 01:40:27 +00:00
|
|
|
/* No other VFIO device in the set can be open. */
|
|
|
|
if (vfio_device_set_open_count(dev_set) > 1)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
|
2021-08-06 01:19:05 +00:00
|
|
|
needs_reset |= cur->needs_reset;
|
|
|
|
return needs_reset;
|
2020-04-22 19:48:11 +00:00
|
|
|
}
|
|
|
|
|
2014-08-07 17:12:07 +00:00
|
|
|
/*
|
2021-08-06 01:19:05 +00:00
|
|
|
* If a bus or slot reset is available for the provided dev_set and:
|
2018-12-12 19:51:07 +00:00
|
|
|
* - All of the devices affected by that bus or slot reset are unused
|
|
|
|
* - At least one of the affected devices is marked dirty via
|
|
|
|
* needs_reset (such as by lack of FLR support)
|
2021-08-06 01:19:05 +00:00
|
|
|
* Then attempt to perform that bus or slot reset.
|
2014-08-07 17:12:07 +00:00
|
|
|
*/
|
vfio/pci: Move the unused device into low power state with runtime PM
Currently, there is very limited power management support
available in the upstream vfio_pci_core based drivers. If there
are no users of the device, the PCI device is moved into the
D3hot state by writing directly into the PCI PM registers. This
D3hot state helps in saving power, but we can achieve zero power
consumption if we go into the D3cold state. The D3cold state cannot
be reached with native PCI PM; it requires interaction with platform
firmware, which is system-specific. To go into low power states
(including D3cold), the runtime PM framework can be used, which
internally interacts with the PCI core and platform firmware and
puts the device into the lowest possible D-state.
This patch registers vfio_pci_core based drivers with the
runtime PM framework.
1. The PCI core framework takes care of most runtime PM related
things. To enable runtime PM, the PCI driver needs to decrement
the usage count and, at a minimum, provide a 'struct dev_pm_ops'.
The runtime suspend/resume callbacks are optional and needed
only if extra handling is required. Since there are now multiple
vfio_pci_core based drivers, instead of assigning the
'struct dev_pm_ops' in each individual parent driver, vfio_pci_core
itself assigns it. There are other drivers where the
'struct dev_pm_ops' is assigned inside a core layer
(for example, wlcore_probe() and some sound based drivers).
2. This patch provides a stub implementation of 'struct dev_pm_ops';
a subsequent patch will provide the runtime suspend/resume
callbacks. All config state saving and PCI power management
related work is done by the PCI core framework itself inside its
runtime suspend/resume callbacks (pci_pm_runtime_suspend() and
pci_pm_runtime_resume()). (A sketch of such a stub registration
follows below.)
3. Inside pci_reset_bus(), all the devices in the dev_set need to
be runtime resumed. vfio_pci_dev_set_pm_runtime_get() takes care
of the runtime resume and its error handling.
4. Inside vfio_pci_core_disable(), the device usage count always
needs to be decremented, matching the increment done in
vfio_pci_core_enable().
5. Since the runtime PM framework provides the same functionality,
directly writing into the PCI PM config register can be replaced
with the runtime PM routines. Runtime PM can also achieve more
power saving.
On systems which do not support D3cold, with the existing
implementation:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D0
With runtime PM:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D3hot
So, with runtime PM, the upstream bridge or root port will also go
into a lower power state, which is not possible with the existing
implementation.
On systems which support D3cold, with the existing implementation:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3hot
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D0
With runtime PM:
// PCI device
# cat /sys/bus/pci/devices/0000\:01\:00.0/power_state
D3cold
// upstream bridge
# cat /sys/bus/pci/devices/0000\:00\:01.0/power_state
D3cold
So, with runtime PM, both the PCI device and the upstream bridge
go into the D3cold state.
6. If the 'disable_idle_d3' module parameter is set, runtime PM is
still enabled, but in this case the usage count should not be
decremented.
7. The vfio_pci_dev_set_try_reset() return value is now unused, so
its return type can be changed to void.
8. Use the runtime PM APIs in vfio_pci_core_sriov_configure().
The device can be in a low power state either through runtime
power management (when there is no user) or through a PCI_PM_CTRL
register write by the user. In both cases, the PF should be moved
to D0. To prevent any runtime usage count mismatch, pci_num_vf()
is called explicitly during disable.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220518111612.16985-5-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-05-18 11:16:12 +00:00
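To make point 1 above concrete, here is a minimal sketch of what a
stub 'struct dev_pm_ops' registration and the probe-time usage-count
decrement could look like. The names, the pci_driver placement, and
the probe context are illustrative assumptions, not the exact patch:

	/* Stub ops: the PCI core's pci_pm_runtime_suspend()/resume()
	 * already handle config state saving and D-state programming. */
	static const struct dev_pm_ops vfio_pci_core_pm_ops = { };

	/* Partial pci_driver: only the PM-relevant field is shown. */
	static struct pci_driver vfio_pci_driver = {
		.name = "vfio-pci",
		.driver = {
			.pm = &vfio_pci_core_pm_ops,
		},
	};

	/* At the end of probe: drop the usage count taken by the PCI
	 * core so an unused device may runtime suspend (points 1/6). */
	if (!disable_idle_d3)
		pm_runtime_put(&pdev->dev);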
|
|
|
static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
|
2014-08-07 17:12:07 +00:00
|
|
|
{
|
2021-08-26 10:39:02 +00:00
|
|
|
struct vfio_pci_core_device *cur;
|
2021-08-06 01:19:05 +00:00
|
|
|
struct pci_dev *pdev;
|
2022-05-18 11:16:12 +00:00
|
|
|
bool reset_done = false;
|
2014-09-29 23:18:39 +00:00
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
if (!vfio_pci_dev_set_needs_reset(dev_set))
|
2022-05-18 11:16:12 +00:00
|
|
|
return;
|
2018-12-12 19:51:07 +00:00
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
pdev = vfio_pci_dev_set_resettable(dev_set);
|
|
|
|
if (!pdev)
|
2022-05-18 11:16:12 +00:00
|
|
|
return;
|
2015-04-07 17:14:46 +00:00
|
|
|
|
2022-02-17 12:21:07 +00:00
|
|
|
/*
|
2022-05-18 11:16:12 +00:00
|
|
|
* Some of the devices in the bus can be in the runtime suspended
|
|
|
|
* state. Increment the usage count for all the devices in the dev_set
|
|
|
|
* before reset and decrement the same after reset.
|
2022-02-17 12:21:07 +00:00
|
|
|
*/
|
2022-05-18 11:16:12 +00:00
|
|
|
if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
|
|
|
|
return;
|
2022-02-17 12:21:07 +00:00
|
|
|
|
2022-05-18 11:16:12 +00:00
|
|
|
if (!pci_reset_bus(pdev))
|
|
|
|
reset_done = true;
|
2015-04-07 17:14:46 +00:00
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
|
2022-05-18 11:16:12 +00:00
|
|
|
if (reset_done)
|
|
|
|
cur->needs_reset = false;
|
|
|
|
|
2021-08-06 01:19:05 +00:00
|
|
|
if (!disable_idle_d3)
|
2022-05-18 11:16:12 +00:00
|
|
|
pm_runtime_put(&cur->pdev->dev);
|
2014-09-29 23:18:39 +00:00
|
|
|
}
|
2014-08-07 17:12:07 +00:00
|
|
|
}
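Putting the pieces together, vfio_pci_dev_set_try_reset() reduces to
the following flow (a consolidated sketch of the lines shown above,
with comments added; not a verbatim copy):

	static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set)
	{
		struct vfio_pci_core_device *cur;
		struct pci_dev *pdev;
		bool reset_done = false;

		/* Only proceed if the set is idle and marked dirty. */
		if (!vfio_pci_dev_set_needs_reset(dev_set))
			return;

		pdev = vfio_pci_dev_set_resettable(dev_set);
		if (!pdev)
			return;

		/* Runtime resume every device in the set before the bus
		 * reset; each one is put again in the loop below. */
		if (!disable_idle_d3 && vfio_pci_dev_set_pm_runtime_get(dev_set))
			return;

		if (!pci_reset_bus(pdev))
			reset_done = true;

		list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
			if (reset_done)
				cur->needs_reset = false;

			if (!disable_idle_d3)
				pm_runtime_put(&cur->pdev->dev);
		}
	}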
|
|
|
|
|
2021-08-26 10:39:07 +00:00
|
|
|
void vfio_pci_core_set_params(bool is_nointxmask, bool is_disable_vga,
|
|
|
|
bool is_disable_idle_d3)
|
|
|
|
{
|
|
|
|
nointxmask = is_nointxmask;
|
|
|
|
disable_vga = is_disable_vga;
|
|
|
|
disable_idle_d3 = is_disable_idle_d3;
|
|
|
|
}
|
2021-08-26 10:39:12 +00:00
|
|
|
EXPORT_SYMBOL_GPL(vfio_pci_core_set_params);
|
2021-08-26 10:39:07 +00:00
|
|
|
|
2021-08-26 10:39:12 +00:00
|
|
|
static void vfio_pci_core_cleanup(void)
|
2012-07-31 14:16:24 +00:00
|
|
|
{
|
|
|
|
vfio_pci_uninit_perm_bits();
|
|
|
|
}
|
|
|
|
|
2021-08-26 10:39:12 +00:00
|
|
|
static int __init vfio_pci_core_init(void)
|
2015-04-07 17:14:43 +00:00
|
|
|
{
|
2021-03-30 15:54:21 +00:00
|
|
|
/* Allocate shared config space permission data used by all devices */
|
2021-08-26 10:39:05 +00:00
|
|
|
return vfio_pci_init_perm_bits();
|
2012-07-31 14:16:24 +00:00
|
|
|
}
|
2021-08-26 10:39:12 +00:00
|
|
|
|
|
|
|
module_init(vfio_pci_core_init);
|
|
|
|
module_exit(vfio_pci_core_cleanup);
|
|
|
|
|
|
|
|
MODULE_LICENSE("GPL v2");
|
|
|
|
MODULE_AUTHOR(DRIVER_AUTHOR);
|
|
|
|
MODULE_DESCRIPTION(DRIVER_DESC);
|