2019-06-04 10:11:33 +02:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-only */
|
2012-07-31 08:16:24 -06:00
|
|
|
/*
|
|
|
|
* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
|
|
|
|
* Author: Alex Williamson <alex.williamson@redhat.com>
|
|
|
|
*
|
|
|
|
* Derived from original vfio:
|
|
|
|
* Copyright 2010 Cisco Systems, Inc. All rights reserved.
|
|
|
|
* Author: Tom Lyon, pugs@cisco.com
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/mutex.h>
|
|
|
|
#include <linux/pci.h>
|
2021-08-26 13:39:04 +03:00
|
|
|
#include <linux/vfio.h>
|
2015-09-18 22:29:50 +08:00
|
|
|
#include <linux/irqbypass.h>
|
2016-02-22 16:02:39 -07:00
|
|
|
#include <linux/types.h>
|
vfio/pci: Introduce VF token
If we enable SR-IOV on a vfio-pci owned PF, the resulting VFs are not
fully isolated from the PF. The PF can always cause a denial of service
to the VF, even if by simply resetting itself. The degree to which a PF
can access the data passed through a VF or interfere with its operation
is dependent on a given SR-IOV implementation. Therefore we want to
avoid a scenario where an existing vfio-pci based userspace driver might
assume the PF driver is trusted, for example assigning a PF to one VM
and VF to another with some expectation of isolation. IOMMU grouping
could be a solution to this, but imposes an unnecessarily strong
relationship between PF and VF drivers if they need to operate with the
same IOMMU context. Instead we introduce a "VF token", which is
essentially just a shared secret between PF and VF drivers, implemented
as a UUID.
The VF token can be set by a vfio-pci based PF driver and must be known
by the vfio-pci based VF driver in order to gain access to the device.
This allows the degree to which this VF token is considered secret to be
determined by the applications and environment. For example a VM might
generate a random UUID known only internally to the hypervisor while a
userspace networking appliance might use a shared, or even well known,
UUID among the application drivers.
To incorporate this VF token, the VFIO_GROUP_GET_DEVICE_FD interface is
extended to accept key=value pairs in addition to the device name. This
allows us to most easily deny user access to the device without risk
that existing userspace drivers assume region offsets, IRQs, and other
device features, leading to more elaborate error paths. The format of
these options is expected to take the form:
"$DEVICE_NAME $OPTION1=$VALUE1 $OPTION2=$VALUE2"
Where the device name is always provided first for compatibility and
additional options are specified in a space separated list. The
relation between and requirements for the additional options will be
vfio bus driver dependent, however unknown or unused options within this
schema should return an error. This allows for future use of unknown
options as well as a positive indication to the user that an option is
used.
An example VF token option would take this form:
"0000:03:00.0 vf_token=2ab74924-c335-45f4-9b16-8569e5b08258"
When accessing a VF where the PF is making use of vfio-pci, the user
MUST provide the current vf_token. When accessing a PF, the user MUST
provide the current vf_token IF there are active VF users or MAY provide
a vf_token in order to set the current VF token when no VF users are
active. The former requirement assures VF users that an unassociated
driver cannot usurp the PF device. These semantics also imply that a
VF token MUST be set by a PF driver before VF drivers can access their
device, the default token is random and mechanisms to read the token are
not provided in order to protect the VF token of previous users. Use of
the vf_token option outside of these cases will return an error, as
discussed above.
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2020-03-24 09:28:27 -06:00
|
|
|
#include <linux/uuid.h>
|
2020-03-24 09:28:28 -06:00
|
|
|
#include <linux/notifier.h>
|
2012-07-31 08:16:24 -06:00
|
|
|
|
2021-08-26 13:39:01 +03:00
|
|
|
#ifndef VFIO_PCI_CORE_H
|
|
|
|
#define VFIO_PCI_CORE_H
|
2012-07-31 08:16:24 -06:00
|
|
|
|
|
|
|
/*
 * Offset <-> region index encoding.
 *
 * The index lives in the bits at and above VFIO_PCI_OFFSET_SHIFT; the bits
 * below it (VFIO_PCI_OFFSET_MASK) are the offset within that index.
 *
 * Every macro argument is parenthesized so that an expression argument such
 * as (a | b) is shifted as a whole rather than being split up by operator
 * precedence.
 */
#define VFIO_PCI_OFFSET_SHIFT		40
#define VFIO_PCI_OFFSET_TO_INDEX(off)	((off) >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK		(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
|
|
|
|
|
2021-08-26 13:39:02 +03:00
|
|
|
struct vfio_pci_core_device;
|
2016-02-22 16:02:39 -07:00
|
|
|
struct vfio_pci_region;
|
|
|
|
|
|
|
|
struct vfio_pci_regops {
|
2021-08-26 13:39:02 +03:00
|
|
|
ssize_t (*rw)(struct vfio_pci_core_device *vdev, char __user *buf,
|
2016-02-22 16:02:39 -07:00
|
|
|
size_t count, loff_t *ppos, bool iswrite);
|
2021-08-26 13:39:02 +03:00
|
|
|
void (*release)(struct vfio_pci_core_device *vdev,
|
2016-02-22 16:02:39 -07:00
|
|
|
struct vfio_pci_region *region);
|
2021-08-26 13:39:02 +03:00
|
|
|
int (*mmap)(struct vfio_pci_core_device *vdev,
|
2018-12-19 19:52:30 +11:00
|
|
|
struct vfio_pci_region *region,
|
|
|
|
struct vm_area_struct *vma);
|
2021-08-26 13:39:02 +03:00
|
|
|
int (*add_capability)(struct vfio_pci_core_device *vdev,
|
2018-12-19 19:52:31 +11:00
|
|
|
struct vfio_pci_region *region,
|
|
|
|
struct vfio_info_cap *caps);
|
2016-02-22 16:02:39 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
struct vfio_pci_region {
|
|
|
|
u32 type;
|
|
|
|
u32 subtype;
|
|
|
|
const struct vfio_pci_regops *ops;
|
|
|
|
void *data;
|
|
|
|
size_t size;
|
|
|
|
u32 flags;
|
|
|
|
};
|
|
|
|
|
2021-08-26 13:39:02 +03:00
|
|
|
struct vfio_pci_core_device {
|
2021-03-30 09:53:07 -06:00
|
|
|
struct vfio_device vdev;
|
2012-07-31 08:16:24 -06:00
|
|
|
struct pci_dev *pdev;
|
2019-09-28 02:43:08 +03:00
|
|
|
void __iomem *barmap[PCI_STD_NUM_BARS];
|
|
|
|
bool bar_mmap_supported[PCI_STD_NUM_BARS];
|
2012-07-31 08:16:24 -06:00
|
|
|
u8 *pci_config_map;
|
|
|
|
u8 *vconfig;
|
|
|
|
struct perm_bits *msi_perm;
|
|
|
|
spinlock_t irqlock;
|
|
|
|
struct mutex igate;
|
2023-05-11 08:44:32 -07:00
|
|
|
struct xarray ctx;
|
2012-07-31 08:16:24 -06:00
|
|
|
int irq_type;
|
2016-02-22 16:02:39 -07:00
|
|
|
int num_regions;
|
|
|
|
struct vfio_pci_region *region;
|
2012-07-31 08:16:24 -06:00
|
|
|
u8 msi_qmax;
|
|
|
|
u8 msix_bar;
|
|
|
|
u16 msix_size;
|
|
|
|
u32 msix_offset;
|
|
|
|
u32 rbar[7];
|
2023-05-11 08:44:36 -07:00
|
|
|
bool has_dyn_msix:1;
|
2023-05-11 08:44:35 -07:00
|
|
|
bool pci_2_3:1;
|
|
|
|
bool virq_disabled:1;
|
|
|
|
bool reset_works:1;
|
|
|
|
bool extended_caps:1;
|
|
|
|
bool bardirty:1;
|
|
|
|
bool has_vga:1;
|
|
|
|
bool needs_reset:1;
|
|
|
|
bool nointx:1;
|
|
|
|
bool needs_pm_restore:1;
|
|
|
|
bool pm_intx_masked:1;
|
|
|
|
bool pm_runtime_engaged:1;
|
2012-07-31 08:16:24 -06:00
|
|
|
struct pci_saved_state *pci_saved_state;
|
2019-02-09 13:43:30 -07:00
|
|
|
struct pci_saved_state *pm_save;
|
2018-03-21 12:46:21 -06:00
|
|
|
int ioeventfds_nr;
|
2013-03-11 09:31:22 -06:00
|
|
|
struct eventfd_ctx *err_trigger;
|
2015-02-06 15:05:08 -07:00
|
|
|
struct eventfd_ctx *req_trigger;
|
vfio/pci: Implement VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP
This patch implements VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP
device feature. In the VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY, if there is
any access for the VFIO device on the host side, then the device will
be moved out of the low power state without the user's guest driver
involvement. Once the device access has been finished, then the host
can move the device again into low power state. With the low power
entry happened through VFIO_DEVICE_FEATURE_LOW_POWER_ENTRY_WITH_WAKEUP,
the device will not be moved back into the low power state and
a notification will be sent to the user by triggering wakeup eventfd.
vfio_pci_core_pm_entry() will be called for both the variants of low
power feature entry so add an extra argument for wakeup eventfd context
and store locally in 'struct vfio_pci_core_device'.
For the entry happened without wakeup eventfd, all the exit related
handling will be done by the LOW_POWER_EXIT device feature only.
When the LOW_POWER_EXIT will be called, then the vfio core layer
vfio_device_pm_runtime_get() will increment the usage count and will
resume the device. In the driver runtime_resume callback, the
'pm_wake_eventfd_ctx' will be NULL. Then vfio_pci_core_pm_exit()
will call vfio_pci_runtime_pm_exit() and all the exit related handling
will be done.
For the entry happened with wakeup eventfd, in the driver resume
callback, eventfd will be triggered and all the exit related handling will
be done. When vfio_pci_runtime_pm_exit() will be called by
vfio_pci_core_pm_exit(), then it will return early.
But if the runtime suspend has not happened on the host side, then
all the exit related handling will be done in vfio_pci_core_pm_exit()
only.
Signed-off-by: Abhishek Sahu <abhsahu@nvidia.com>
Link: https://lore.kernel.org/r/20220829114850.4341-6-abhsahu@nvidia.com
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2022-08-29 17:18:50 +05:30
|
|
|
struct eventfd_ctx *pm_wake_eventfd_ctx;
|
2016-06-30 15:21:24 +08:00
|
|
|
struct list_head dummy_resources_list;
|
2018-03-21 12:46:21 -06:00
|
|
|
struct mutex ioeventfds_lock;
|
|
|
|
struct list_head ioeventfds_list;
|
vfio/pci: Introduce VF token
If we enable SR-IOV on a vfio-pci owned PF, the resulting VFs are not
fully isolated from the PF. The PF can always cause a denial of service
to the VF, even if by simply resetting itself. The degree to which a PF
can access the data passed through a VF or interfere with its operation
is dependent on a given SR-IOV implementation. Therefore we want to
avoid a scenario where an existing vfio-pci based userspace driver might
assume the PF driver is trusted, for example assigning a PF to one VM
and VF to another with some expectation of isolation. IOMMU grouping
could be a solution to this, but imposes an unnecessarily strong
relationship between PF and VF drivers if they need to operate with the
same IOMMU context. Instead we introduce a "VF token", which is
essentially just a shared secret between PF and VF drivers, implemented
as a UUID.
The VF token can be set by a vfio-pci based PF driver and must be known
by the vfio-pci based VF driver in order to gain access to the device.
This allows the degree to which this VF token is considered secret to be
determined by the applications and environment. For example a VM might
generate a random UUID known only internally to the hypervisor while a
userspace networking appliance might use a shared, or even well know,
UUID among the application drivers.
To incorporate this VF token, the VFIO_GROUP_GET_DEVICE_FD interface is
extended to accept key=value pairs in addition to the device name. This
allows us to most easily deny user access to the device without risk
that existing userspace drivers assume region offsets, IRQs, and other
device features, leading to more elaborate error paths. The format of
these options are expected to take the form:
"$DEVICE_NAME $OPTION1=$VALUE1 $OPTION2=$VALUE2"
Where the device name is always provided first for compatibility and
additional options are specified in a space separated list. The
relation between and requirements for the additional options will be
vfio bus driver dependent, however unknown or unused option within this
schema should return error. This allow for future use of unknown
options as well as a positive indication to the user that an option is
used.
An example VF token option would take this form:
"0000:03:00.0 vf_token=2ab74924-c335-45f4-9b16-8569e5b08258"
When accessing a VF where the PF is making use of vfio-pci, the user
MUST provide the current vf_token. When accessing a PF, the user MUST
provide the current vf_token IF there are active VF users or MAY provide
a vf_token in order to set the current VF token when no VF users are
active. The former requirement assures VF users that an unassociated
driver cannot usurp the PF device. These semantics also imply that a
VF token MUST be set by a PF driver before VF drivers can access their
device, the default token is random and mechanisms to read the token are
not provided in order to protect the VF token of previous users. Use of
the vf_token option outside of these cases will return an error, as
discussed above.
Reviewed-by: Cornelia Huck <cohuck@redhat.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2020-03-24 09:28:27 -06:00
|
|
|
struct vfio_pci_vf_token *vf_token;
|
2022-04-13 10:10:36 -03:00
|
|
|
struct list_head sriov_pfs_item;
|
|
|
|
struct vfio_pci_core_device *sriov_pf_core_dev;
|
2020-03-24 09:28:28 -06:00
|
|
|
struct notifier_block nb;
|
2020-04-28 13:12:20 -06:00
|
|
|
struct mutex vma_lock;
|
|
|
|
struct list_head vma_list;
|
vfio-pci: Invalidate mmaps and block MMIO access on disabled memory
Accessing the disabled memory space of a PCI device would typically
result in a master abort response on conventional PCI, or an
unsupported request on PCI express. The user would generally see
these as a -1 response for the read return data and the write would be
silently discarded, possibly with an uncorrected, non-fatal AER error
triggered on the host. Some systems however take it upon themselves
to bring down the entire system when they see something that might
indicate a loss of data, such as this discarded write to a disabled
memory space.
To avoid this, we want to try to block the user from accessing memory
spaces while they're disabled. We start with a semaphore around the
memory enable bit, where writers modify the memory enable state and
must be serialized, while readers make use of the memory region and
can access in parallel. Writers include both direct manipulation via
the command register, as well as any reset path where the internal
mechanics of the reset may both explicitly and implicitly disable
memory access, and manipulation of the MSI-X configuration, where the
MSI-X vector table resides in MMIO space of the device. Readers
include the read and write file ops to access the vfio device fd
offsets as well as memory mapped access. In the latter case, we make
use of our new vma list support to zap, or invalidate, those memory
mappings in order to force them to be faulted back in on access.
Our semaphore usage will stall user access to MMIO spaces across
internal operations like reset, but the user might experience new
behavior when trying to access the MMIO space while disabled via the
PCI command register. Access via read or write while disabled will
return -EIO and access via memory maps will result in a SIGBUS. This
is expected to be compatible with known use cases and potentially
provides better error handling capabilities than present in the
hardware, while avoiding the more readily accessible and severe
platform error responses that might otherwise occur.
Fixes: CVE-2020-12888
Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
2020-04-22 13:48:11 -06:00
|
|
|
struct rw_semaphore memory_lock;
|
2012-07-31 08:16:24 -06:00
|
|
|
};
|
|
|
|
|
2022-08-26 16:34:01 -03:00
|
|
|
/* Will be exported for vfio pci drivers usage */
|
2022-08-26 16:34:02 -03:00
|
|
|
int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev,
|
|
|
|
unsigned int type, unsigned int subtype,
|
|
|
|
const struct vfio_pci_regops *ops,
|
|
|
|
size_t size, u32 flags, void *data);
|
2021-08-26 13:39:07 +03:00
|
|
|
void vfio_pci_core_set_params(bool nointxmask, bool is_disable_vga,
|
|
|
|
bool is_disable_idle_d3);
|
2021-08-26 13:39:05 +03:00
|
|
|
void vfio_pci_core_close_device(struct vfio_device *core_vdev);
|
2022-09-21 18:43:48 +08:00
|
|
|
int vfio_pci_core_init_dev(struct vfio_device *core_vdev);
|
|
|
|
void vfio_pci_core_release_dev(struct vfio_device *core_vdev);
|
2021-08-26 13:39:05 +03:00
|
|
|
int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev);
|
|
|
|
void vfio_pci_core_unregister_device(struct vfio_pci_core_device *vdev);
|
|
|
|
extern const struct pci_error_handlers vfio_pci_core_err_handlers;
|
2022-05-11 13:19:07 -06:00
|
|
|
int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev,
|
|
|
|
int nr_virtfn);
|
2021-08-26 13:39:05 +03:00
|
|
|
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
|
|
|
|
unsigned long arg);
|
2022-02-24 16:20:17 +02:00
|
|
|
int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
|
|
|
|
void __user *arg, size_t argsz);
|
2021-08-26 13:39:05 +03:00
|
|
|
ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
|
|
|
|
size_t count, loff_t *ppos);
|
|
|
|
ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
|
|
|
|
size_t count, loff_t *ppos);
|
|
|
|
int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma);
|
|
|
|
void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count);
|
|
|
|
int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
|
2021-08-26 13:39:06 +03:00
|
|
|
int vfio_pci_core_enable(struct vfio_pci_core_device *vdev);
|
|
|
|
void vfio_pci_core_disable(struct vfio_pci_core_device *vdev);
|
|
|
|
void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
|
2023-12-19 11:32:45 +02:00
|
|
|
int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar);
|
2022-02-24 16:20:23 +02:00
|
|
|
pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
|
|
|
|
pci_channel_state_t state);
|
2021-08-26 13:39:06 +03:00
|
|
|
|
2023-12-19 11:32:46 +02:00
|
|
|
#define VFIO_IOWRITE_DECLATION(size) \
|
|
|
|
int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \
|
|
|
|
bool test_mem, u##size val, void __iomem *io);
|
|
|
|
|
|
|
|
VFIO_IOWRITE_DECLATION(8)
|
|
|
|
VFIO_IOWRITE_DECLATION(16)
|
|
|
|
VFIO_IOWRITE_DECLATION(32)
|
|
|
|
#ifdef iowrite64
|
|
|
|
VFIO_IOWRITE_DECLATION(64)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#define VFIO_IOREAD_DECLATION(size) \
|
|
|
|
int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \
|
|
|
|
bool test_mem, u##size *val, void __iomem *io);
|
|
|
|
|
|
|
|
VFIO_IOREAD_DECLATION(8)
|
|
|
|
VFIO_IOREAD_DECLATION(16)
|
|
|
|
VFIO_IOREAD_DECLATION(32)
|
|
|
|
|
2021-08-26 13:39:01 +03:00
|
|
|
#endif /* VFIO_PCI_CORE_H */
|