VFIO updates for v6.7

- Add support for "chunk mode" in the mlx5-vfio-pci variant driver,
    which allows both larger device image sizes for migration, beyond
    the previous 4GB limit, and also read-ahead support for improved
    migration performance. (Yishai Hadas)
 
  - A new bus master control interface for the CDX bus driver where
    there is no in-band mechanism to toggle device DMA as there is
    through config space on PCI devices. (Nipun Gupta)
 
  - Add explicit alignment directives to vfio data structures to
    reduce the chance of breaking 32-bit userspace.  In most cases
    this is transparent and the remaining cases where data structures
    are padded work within the existing rules for extending data
    structures within vfio.  (Stefan Hajnoczi)
 
  - Resolve a bug in the cdx bus driver noted when compiled with clang
    where missing parenthesis result in the wrong operation.
    (Nathan Chancellor)
 
  - Resolve errors reported by smatch for a function when dealing
    with invalid inputs. (Alex Williamson)
 
  - Add migration support to the mtty vfio/mdev sample driver for
    testing and integration purposes, allowing CI of migration without
    specific hardware requirements.  Also resolve many of the short-
    comings of this driver relative to implementation of the vfio
    interrupt ioctl along the way. (Alex Williamson)
 -----BEGIN PGP SIGNATURE-----
 
 iQJPBAABCAA5FiEEQvbATlQL0amee4qQI5ubbjuwiyIFAmVALFYbHGFsZXgud2ls
 bGlhbXNvbkByZWRoYXQuY29tAAoJECObm247sIsigekP/3IbfI8XPUheSE/4f790
 TAaot7GBvXs7pBGF0ewtVGn/Lo1p3gWDdbGoJNWNoHGUnzCHBkewpCA0cd2xmn4Y
 fJTax1WbBdagmMQ7gGFC2x06JgWHZqSX43rD4ZhQDVgv4LhxBJ/eTAdEZocFIKTL
 a7F/2VH7EaGnybOCOwBDMA8lBr3DJ2eiU25kBYCLRzT61GIwlkurYI7TIX2AcHmW
 iWA7OwJ8YDzVd6IGBad003hDm0MRmkgGvdSR8rHj0Bnz7tSS6UcsNGu0OsUzJdxF
 h6SCaK2w8eKjVaU/OmRZrteq1lwryUbuY1Gdceazj0fcvsMnP19k3nWadSZ2+/mv
 fqC60ahjfN0cIwVeV1GHcGeJI2ImNmCO/Uup7Y+bH9FSc5TQuKK90FRlYmHveUY7
 XEkmi8rcpwEvcrOHT8uZxSRyi6lbjo7i88DWPJ1ByyM82l1rg2ktNI1KEOsvQOdV
 QteFdD3e8mHQYwTAI4MbhWPRvJsN+RM05UKrTvpFHwa4eUqfaJQ0SJktbZIlm/XB
 8UFPF+h6iUFuqJwAH4zlEvDyjMZlAkp9redlKklyGPUreW8MOo/1yoOl7Q5dgibj
 +ZTy7IJjrGXrTpeL1QohBl3BHrGMM2O3rYubkXKZIOU38MdXXn4xzDY+w+fNwB5l
 c/oIN2i1TIUr4Ub0BfI7Fxk7
 =3kXB
 -----END PGP SIGNATURE-----

Merge tag 'vfio-v6.7-rc1' of https://github.com/awilliam/linux-vfio

Pull VFIO updates from Alex Williamson:

 - Add support for "chunk mode" in the mlx5-vfio-pci variant driver,
   which allows both larger device image sizes for migration, beyond the
   previous 4GB limit, and also read-ahead support for improved
   migration performance (Yishai Hadas)

 - A new bus master control interface for the CDX bus driver where there
   is no in-band mechanism to toggle device DMA as there is through
   config space on PCI devices (Nipun Gupta)

 - Add explicit alignment directives to vfio data structures to reduce
   the chance of breaking 32-bit userspace. In most cases this is
   transparent and the remaining cases where data structures are padded
   work within the existing rules for extending data structures within
   vfio (Stefan Hajnoczi)

 - Resolve a bug in the cdx bus driver noted when compiled with clang
   where missing parentheses result in the wrong operation (Nathan
   Chancellor)

 - Resolve errors reported by smatch for a function when dealing with
   invalid inputs (Alex Williamson)

 - Add migration support to the mtty vfio/mdev sample driver for testing
   and integration purposes, allowing CI of migration without specific
   hardware requirements. Also resolve many of the shortcomings of this
   driver relative to implementation of the vfio interrupt ioctl along
   the way (Alex Williamson)

* tag 'vfio-v6.7-rc1' of https://github.com/awilliam/linux-vfio:
  vfio/mtty: Enable migration support
  vfio/mtty: Overhaul mtty interrupt handling
  vfio: Fix smatch errors in vfio_combine_iova_ranges()
  vfio/cdx: Add parentheses between bitwise AND expression and logical NOT
  vfio/mlx5: Activate the chunk mode functionality
  vfio/mlx5: Add support for READING in chunk mode
  vfio/mlx5: Add support for SAVING in chunk mode
  vfio/mlx5: Pre-allocate chunks for the STOP_COPY phase
  vfio/mlx5: Rename some stuff to match chunk mode
  vfio/mlx5: Enable querying state size which is > 4GB
  vfio/mlx5: Refactor the SAVE callback to activate a work only upon an error
  vfio/mlx5: Wake up the reader post of disabling the SAVING migration file
  vfio: use __aligned_u64 in struct vfio_device_ioeventfd
  vfio: use __aligned_u64 in struct vfio_device_gfx_plane_info
  vfio: trivially use __aligned_u64 for ioctl structs
  vfio-cdx: add bus mastering device feature support
  vfio: add bus master feature to device feature ioctl
  cdx: add support for bus mastering
This commit is contained in:
Linus Torvalds 2023-11-01 13:55:40 -10:00
commit deefd5024f
16 changed files with 1298 additions and 192 deletions

View File

@ -182,6 +182,38 @@ cdx_match_id(const struct cdx_device_id *ids, struct cdx_device *dev)
return NULL;
}
int cdx_set_master(struct cdx_device *cdx_dev)
{
struct cdx_controller *cdx = cdx_dev->cdx;
struct cdx_device_config dev_config;
int ret = -EOPNOTSUPP;
dev_config.type = CDX_DEV_BUS_MASTER_CONF;
dev_config.bus_master_enable = true;
if (cdx->ops->dev_configure)
ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num,
cdx_dev->dev_num, &dev_config);
return ret;
}
EXPORT_SYMBOL_GPL(cdx_set_master);
int cdx_clear_master(struct cdx_device *cdx_dev)
{
struct cdx_controller *cdx = cdx_dev->cdx;
struct cdx_device_config dev_config;
int ret = -EOPNOTSUPP;
dev_config.type = CDX_DEV_BUS_MASTER_CONF;
dev_config.bus_master_enable = false;
if (cdx->ops->dev_configure)
ret = cdx->ops->dev_configure(cdx, cdx_dev->bus_num,
cdx_dev->dev_num, &dev_config);
return ret;
}
EXPORT_SYMBOL_GPL(cdx_clear_master);
/**
* cdx_bus_match - device to driver matching callback
* @dev: the cdx device to match against

View File

@ -56,6 +56,10 @@ static int cdx_configure_device(struct cdx_controller *cdx,
case CDX_DEV_RESET_CONF:
ret = cdx_mcdi_reset_device(cdx->priv, bus_num, dev_num);
break;
case CDX_DEV_BUS_MASTER_CONF:
ret = cdx_mcdi_bus_master_enable(cdx->priv, bus_num, dev_num,
dev_config->bus_master_enable);
break;
default:
ret = -EINVAL;
}

View File

@ -137,3 +137,61 @@ int cdx_mcdi_reset_device(struct cdx_mcdi *cdx, u8 bus_num, u8 dev_num)
return ret;
}
/*
 * Read the device control flags of the device at bus_num:dev_num through the
 * MC_CMD_CDX_DEVICE_CONTROL_GET command.
 *
 * On success the flags dword of the response is stored in *flags and 0 is
 * returned. A failing RPC returns its error code unchanged; a response whose
 * length is not MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN yields -EIO. *flags is
 * left untouched on any failure.
 */
static int cdx_mcdi_ctrl_flag_get(struct cdx_mcdi *cdx, u8 bus_num,
				  u8 dev_num, u32 *flags)
{
	MCDI_DECLARE_BUF(req, MC_CMD_CDX_DEVICE_CONTROL_GET_IN_LEN);
	MCDI_DECLARE_BUF(resp, MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN);
	size_t resp_len;
	int rc;

	/* Address the target device */
	MCDI_SET_DWORD(req, CDX_DEVICE_CONTROL_GET_IN_BUS, bus_num);
	MCDI_SET_DWORD(req, CDX_DEVICE_CONTROL_GET_IN_DEVICE, dev_num);

	rc = cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_GET, req,
			  sizeof(req), resp, sizeof(resp), &resp_len);
	if (rc)
		return rc;

	/* Anything other than a full-size response is treated as an I/O error */
	if (resp_len != MC_CMD_CDX_DEVICE_CONTROL_GET_OUT_LEN)
		return -EIO;

	*flags = MCDI_DWORD(resp, CDX_DEVICE_CONTROL_GET_OUT_FLAGS);
	return 0;
}
/*
 * Read-modify-write a single bit of the device control flags of the device
 * at bus_num:dev_num.
 *
 * The current flags are fetched with cdx_mcdi_ctrl_flag_get(), the bit at
 * bit_pos is set when @enable is true and cleared otherwise, and the result
 * is written back with MC_CMD_CDX_DEVICE_CONTROL_SET.
 *
 * Return: the error from the flag read if that fails, otherwise the return
 * value of the SET RPC.
 */
static int cdx_mcdi_ctrl_flag_set(struct cdx_mcdi *cdx, u8 bus_num,
				  u8 dev_num, bool enable, int bit_pos)
{
	MCDI_DECLARE_BUF(inbuf, MC_CMD_CDX_DEVICE_CONTROL_SET_IN_LEN);
	u32 flags;
	int ret;

	ret = cdx_mcdi_ctrl_flag_get(cdx, bus_num, dev_num, &flags);
	if (ret)
		return ret;

	/*
	 * Use BIT() for both directions: "1 << bit_pos" shifts a signed int,
	 * which is undefined for bit 31, and it was inconsistent with the
	 * BIT()-based clear.
	 */
	if (enable)
		flags |= BIT(bit_pos);
	else
		flags &= ~BIT(bit_pos);

	MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_BUS, bus_num);
	MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_DEVICE, dev_num);
	MCDI_SET_DWORD(inbuf, CDX_DEVICE_CONTROL_SET_IN_FLAGS, flags);

	/* No response payload is requested for the SET command */
	return cdx_mcdi_rpc(cdx, MC_CMD_CDX_DEVICE_CONTROL_SET, inbuf,
			    sizeof(inbuf), NULL, 0, NULL);
}
/*
 * Enable or disable bus mastering for the device at bus_num:dev_num by
 * updating the BUS_MASTER_ENABLE bit of its device control flags.
 * Returns whatever cdx_mcdi_ctrl_flag_set() returns.
 */
int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num,
			       u8 dev_num, bool enable)
{
	const int bme_bit =
		MC_CMD_CDX_DEVICE_CONTROL_SET_IN_BUS_MASTER_ENABLE_LBN;

	return cdx_mcdi_ctrl_flag_set(cdx, bus_num, dev_num, enable, bme_bit);
}

View File

@ -58,4 +58,17 @@ int cdx_mcdi_get_dev_config(struct cdx_mcdi *cdx,
int cdx_mcdi_reset_device(struct cdx_mcdi *cdx,
u8 bus_num, u8 dev_num);
/**
* cdx_mcdi_bus_master_enable - Set/Reset bus mastering for cdx device
* represented by bus_num:dev_num
* @cdx: pointer to MCDI interface.
* @bus_num: Bus number.
* @dev_num: Device number.
* @enable: Enable bus mastering if set, disable otherwise.
*
* Return: 0 on success, <0 on failure
*/
int cdx_mcdi_bus_master_enable(struct cdx_mcdi *cdx, u8 bus_num,
u8 dev_num, bool enable);
#endif /* CDX_MCDI_FUNCTIONS_H */

View File

@ -1379,7 +1379,7 @@ static long intel_vgpu_ioctl(struct vfio_device *vfio_dev, unsigned int cmd,
intel_gvt_reset_vgpu(vgpu);
return 0;
} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
struct vfio_device_gfx_plane_info dmabuf;
struct vfio_device_gfx_plane_info dmabuf = {};
int ret = 0;
minsz = offsetofend(struct vfio_device_gfx_plane_info,

View File

@ -14,7 +14,7 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev)
container_of(core_vdev, struct vfio_cdx_device, vdev);
struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev);
int count = cdx_dev->res_count;
int i;
int i, ret;
vdev->regions = kcalloc(count, sizeof(struct vfio_cdx_region),
GFP_KERNEL_ACCOUNT);
@ -39,6 +39,17 @@ static int vfio_cdx_open_device(struct vfio_device *core_vdev)
if (!(cdx_dev->res[i].flags & IORESOURCE_READONLY))
vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_WRITE;
}
ret = cdx_dev_reset(core_vdev->dev);
if (ret) {
kfree(vdev->regions);
vdev->regions = NULL;
return ret;
}
ret = cdx_clear_master(cdx_dev);
if (ret)
vdev->flags &= ~BME_SUPPORT;
else
vdev->flags |= BME_SUPPORT;
return 0;
}
@ -52,6 +63,49 @@ static void vfio_cdx_close_device(struct vfio_device *core_vdev)
cdx_dev_reset(core_vdev->dev);
}
/*
 * Handle the VFIO_DEVICE_FEATURE_BUS_MASTER device feature.
 *
 * Returns -ENOTTY when BME_SUPPORT is not set in the device flags. After
 * vfio_check_feature() accepts the request as a SET operation, the user
 * structure is copied in up to and including the 'op' field and the
 * requested operation is forwarded to cdx_set_master() or
 * cdx_clear_master(). An unrecognised op yields -EINVAL and a faulting copy
 * yields -EFAULT.
 */
static int vfio_cdx_bm_ctrl(struct vfio_device *core_vdev, u32 flags,
			    void __user *arg, size_t argsz)
{
	struct vfio_cdx_device *vdev =
		container_of(core_vdev, struct vfio_cdx_device, vdev);
	struct cdx_device *cdx_dev = to_cdx_device(core_vdev->dev);
	struct vfio_device_feature_bus_master bm;
	size_t copy_len =
		offsetofend(struct vfio_device_feature_bus_master, op);
	int rc;

	if (!(vdev->flags & BME_SUPPORT))
		return -ENOTTY;

	rc = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET,
				sizeof(bm));
	/* Any result other than 1 is handed straight back to the caller */
	if (rc != 1)
		return rc;

	if (copy_from_user(&bm, arg, copy_len))
		return -EFAULT;

	if (bm.op == VFIO_DEVICE_FEATURE_CLEAR_MASTER)
		return cdx_clear_master(cdx_dev);

	if (bm.op == VFIO_DEVICE_FEATURE_SET_MASTER)
		return cdx_set_master(cdx_dev);

	return -EINVAL;
}
/*
 * device_feature callback for vfio-cdx: dispatch on the feature index held
 * in @flags. Only VFIO_DEVICE_FEATURE_BUS_MASTER is handled; every other
 * feature returns -ENOTTY.
 */
static int vfio_cdx_ioctl_feature(struct vfio_device *device, u32 flags,
				  void __user *arg, size_t argsz)
{
	if ((flags & VFIO_DEVICE_FEATURE_MASK) ==
	    VFIO_DEVICE_FEATURE_BUS_MASTER)
		return vfio_cdx_bm_ctrl(device, flags, arg, argsz);

	return -ENOTTY;
}
static int vfio_cdx_ioctl_get_info(struct vfio_cdx_device *vdev,
struct vfio_device_info __user *arg)
{
@ -169,6 +223,7 @@ static const struct vfio_device_ops vfio_cdx_ops = {
.open_device = vfio_cdx_open_device,
.close_device = vfio_cdx_close_device,
.ioctl = vfio_cdx_ioctl,
.device_feature = vfio_cdx_ioctl_feature,
.mmap = vfio_cdx_mmap,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,

View File

@ -23,6 +23,8 @@ struct vfio_cdx_region {
struct vfio_cdx_device {
struct vfio_device vdev;
struct vfio_cdx_region *regions;
u32 flags;
#define BME_SUPPORT BIT(0)
};
#endif /* VFIO_CDX_PRIVATE_H */

View File

@ -86,7 +86,8 @@ int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
}
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
size_t *state_size, u8 query_flags)
size_t *state_size, u64 *total_size,
u8 query_flags)
{
u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
@ -128,6 +129,7 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
MLX5_SET(query_vhca_migration_state_in, in, incremental,
query_flags & MLX5VF_QUERY_INC);
MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);
ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
out);
@ -139,6 +141,11 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
*state_size = MLX5_GET(query_vhca_migration_state_out, out,
required_umem_size);
if (total_size)
*total_size = mvdev->chunk_mode ?
MLX5_GET64(query_vhca_migration_state_out, out,
remaining_total_size) : *state_size;
return 0;
}
@ -254,6 +261,9 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
mvdev->core_device.vdev.migration_flags |=
VFIO_MIGRATION_PRE_COPY;
if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
mvdev->chunk_mode = 1;
end:
mlx5_vf_put_core_dev(mvdev->mdev);
}
@ -428,6 +438,7 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
spin_lock_irq(&buf->migf->list_lock);
buf->stop_copy_chunk_num = 0;
list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
spin_unlock_irq(&buf->migf->list_lock);
}
@ -475,6 +486,15 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
return buf;
}
/*
 * Common tail of a SAVE command: free the command output buffer, signal
 * migf->save_comp and drop one reference on the migration file.
 */
static void
mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
struct mlx5vf_async_data *async_data)
{
/* 'out' is the kvzalloc()ed output buffer set up by mlx5vf_cmd_save_vhca_state() */
kvfree(async_data->out);
complete(&migf->save_comp);
/* NOTE(review): presumably pairs with the get_file() taken when the SAVE was issued - confirm */
fput(migf->filp);
}
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
struct mlx5vf_async_data *async_data = container_of(_work,
@ -487,16 +507,15 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
mlx5vf_put_data_buffer(async_data->buf);
if (async_data->header_buf)
mlx5vf_put_data_buffer(async_data->header_buf);
if (async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
if (!async_data->stop_copy_chunk &&
async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
else
migf->state = MLX5_MIGF_STATE_ERROR;
wake_up_interruptible(&migf->poll_wait);
}
mutex_unlock(&migf->lock);
kvfree(async_data->out);
complete(&migf->save_comp);
fput(migf->filp);
mlx5vf_save_callback_complete(migf, async_data);
}
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
@ -536,13 +555,20 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
struct mlx5_vf_migration_file, async_data);
if (!status) {
size_t next_required_umem_size = 0;
bool stop_copy_last_chunk;
size_t image_size;
unsigned long flags;
bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
!async_data->last_chunk;
!async_data->stop_copy_chunk;
image_size = MLX5_GET(save_vhca_state_out, async_data->out,
actual_image_size);
if (async_data->buf->stop_copy_chunk_num)
next_required_umem_size = MLX5_GET(save_vhca_state_out,
async_data->out, next_required_umem_size);
stop_copy_last_chunk = async_data->stop_copy_chunk &&
!next_required_umem_size;
if (async_data->header_buf) {
status = add_buf_header(async_data->header_buf, image_size,
initial_pre_copy);
@ -554,19 +580,34 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
migf->max_pos += async_data->buf->length;
spin_lock_irqsave(&migf->list_lock, flags);
list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
if (async_data->buf->stop_copy_chunk_num) {
migf->num_ready_chunks++;
if (next_required_umem_size &&
migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
/* Delay the next SAVE till one chunk be consumed */
migf->next_required_umem_size = next_required_umem_size;
next_required_umem_size = 0;
}
}
spin_unlock_irqrestore(&migf->list_lock, flags);
if (initial_pre_copy)
if (initial_pre_copy) {
migf->pre_copy_initial_bytes += image_size;
migf->state = async_data->last_chunk ?
MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY;
migf->state = MLX5_MIGF_STATE_PRE_COPY;
}
if (stop_copy_last_chunk)
migf->state = MLX5_MIGF_STATE_COMPLETE;
wake_up_interruptible(&migf->poll_wait);
if (next_required_umem_size)
mlx5vf_mig_file_set_save_work(migf,
/* Picking up the next chunk num */
(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
next_required_umem_size);
mlx5vf_save_callback_complete(migf, async_data);
return;
}
err:
/*
* The error and the cleanup flows can't run from an
* interrupt context
*/
/* The error flow can't run from an interrupt context */
if (status == -EREMOTEIO)
status = MLX5_GET(save_vhca_state_out, async_data->out, status);
async_data->status = status;
@ -610,7 +651,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
async_data = &migf->async_data;
async_data->buf = buf;
async_data->last_chunk = !track;
async_data->stop_copy_chunk = !track;
async_data->out = kvzalloc(out_size, GFP_KERNEL);
if (!async_data->out) {
err = -ENOMEM;
@ -618,10 +659,15 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
}
if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
if (async_data->last_chunk && migf->buf_header) {
header_buf = migf->buf_header;
migf->buf_header = NULL;
} else {
if (async_data->stop_copy_chunk) {
u8 header_idx = buf->stop_copy_chunk_num ?
buf->stop_copy_chunk_num - 1 : 0;
header_buf = migf->buf_header[header_idx];
migf->buf_header[header_idx] = NULL;
}
if (!header_buf) {
header_buf = mlx5vf_get_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
if (IS_ERR(header_buf)) {
@ -631,8 +677,8 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
}
}
if (async_data->last_chunk)
migf->state = MLX5_MIGF_STATE_SAVE_LAST;
if (async_data->stop_copy_chunk)
migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;
async_data->header_buf = header_buf;
get_file(migf->filp);
@ -707,18 +753,21 @@ void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
struct mlx5_vhca_data_buffer *entry;
int i;
lockdep_assert_held(&migf->mvdev->state_mutex);
WARN_ON(migf->mvdev->mdev_detach);
if (migf->buf) {
mlx5vf_free_data_buffer(migf->buf);
migf->buf = NULL;
}
for (i = 0; i < MAX_NUM_CHUNKS; i++) {
if (migf->buf[i]) {
mlx5vf_free_data_buffer(migf->buf[i]);
migf->buf[i] = NULL;
}
if (migf->buf_header) {
mlx5vf_free_data_buffer(migf->buf_header);
migf->buf_header = NULL;
if (migf->buf_header[i]) {
mlx5vf_free_data_buffer(migf->buf_header[i]);
migf->buf_header[i] = NULL;
}
}
list_splice(&migf->avail_list, &migf->buf_list);

View File

@ -20,7 +20,7 @@ enum mlx5_vf_migf_state {
MLX5_MIGF_STATE_ERROR = 1,
MLX5_MIGF_STATE_PRE_COPY_ERROR,
MLX5_MIGF_STATE_PRE_COPY,
MLX5_MIGF_STATE_SAVE_LAST,
MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK,
MLX5_MIGF_STATE_COMPLETE,
};
@ -64,6 +64,7 @@ struct mlx5_vhca_data_buffer {
u32 mkey;
enum dma_data_direction dma_dir;
u8 dmaed:1;
u8 stop_copy_chunk_num;
struct list_head buf_elm;
struct mlx5_vf_migration_file *migf;
/* Optimize mlx5vf_get_migration_page() for sequential access */
@ -78,10 +79,19 @@ struct mlx5vf_async_data {
struct mlx5_vhca_data_buffer *buf;
struct mlx5_vhca_data_buffer *header_buf;
int status;
u8 last_chunk:1;
u8 stop_copy_chunk:1;
void *out;
};
struct mlx5vf_save_work_data {
struct mlx5_vf_migration_file *migf;
size_t next_required_umem_size;
struct work_struct work;
u8 chunk_num;
};
#define MAX_NUM_CHUNKS 2
struct mlx5_vf_migration_file {
struct file *filp;
struct mutex lock;
@ -94,8 +104,12 @@ struct mlx5_vf_migration_file {
u32 record_tag;
u64 stop_copy_prep_size;
u64 pre_copy_initial_bytes;
struct mlx5_vhca_data_buffer *buf;
struct mlx5_vhca_data_buffer *buf_header;
size_t next_required_umem_size;
u8 num_ready_chunks;
/* Upon chunk mode preserve another set of buffers for stop_copy phase */
struct mlx5_vhca_data_buffer *buf[MAX_NUM_CHUNKS];
struct mlx5_vhca_data_buffer *buf_header[MAX_NUM_CHUNKS];
struct mlx5vf_save_work_data save_data[MAX_NUM_CHUNKS];
spinlock_t list_lock;
struct list_head buf_list;
struct list_head avail_list;
@ -164,6 +178,7 @@ struct mlx5vf_pci_core_device {
u8 deferred_reset:1;
u8 mdev_detach:1;
u8 log_active:1;
u8 chunk_mode:1;
struct completion tracker_comp;
/* protect migration state */
struct mutex state_mutex;
@ -186,7 +201,8 @@ enum {
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod);
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
size_t *state_size, u8 query_flags);
size_t *state_size, u64 *total_size,
u8 query_flags);
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
const struct vfio_migration_ops *mig_ops,
const struct vfio_log_ops *log_ops);
@ -217,6 +233,8 @@ struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work);
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
u8 chunk_num, size_t next_required_umem_size);
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
struct rb_root_cached *ranges, u32 nnodes, u64 *page_size);
int mlx5vf_stop_page_tracker(struct vfio_device *vdev);

View File

@ -24,6 +24,8 @@
/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
#define MAX_CHUNK_SIZE SZ_8M
static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
@ -158,6 +160,41 @@ mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
return found ? buf : NULL;
}
/*
 * Called once a buffer has been read to its end.
 *
 * A buffer that carries a stop_copy_chunk_num is put back into its
 * migf->buf[] / migf->buf_header[] slot (index chunk_num - 1) and removed
 * from the list it is on. Any other buffer is moved to migf->avail_list.
 */
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
struct mlx5_vf_migration_file *migf = vhca_buf->migf;
if (vhca_buf->stop_copy_chunk_num) {
/* A DMA_NONE buffer is treated as a header buffer */
bool is_header = vhca_buf->dma_dir == DMA_NONE;
u8 chunk_num = vhca_buf->stop_copy_chunk_num;
size_t next_required_umem_size = 0;
if (is_header)
migf->buf_header[chunk_num - 1] = vhca_buf;
else
migf->buf[chunk_num - 1] = vhca_buf;
spin_lock_irq(&migf->list_lock);
list_del_init(&vhca_buf->buf_elm);
if (!is_header) {
/*
 * A data chunk was consumed: take over any pending size recorded in
 * migf->next_required_umem_size, clear it, and account for the chunk
 * no longer being ready.
 */
next_required_umem_size =
migf->next_required_umem_size;
migf->next_required_umem_size = 0;
migf->num_ready_chunks--;
}
spin_unlock_irq(&migf->list_lock);
/* Queue the save work for this chunk number only after dropping list_lock */
if (next_required_umem_size)
mlx5vf_mig_file_set_save_work(migf, chunk_num,
next_required_umem_size);
return;
}
/* Not a stop-copy chunk buffer: make it available for reuse */
spin_lock_irq(&migf->list_lock);
list_del_init(&vhca_buf->buf_elm);
list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
spin_unlock_irq(&migf->list_lock);
}
static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
char __user **buf, size_t *len, loff_t *pos)
{
@ -193,12 +230,8 @@ static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
copy_len -= page_len;
}
if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
spin_lock_irq(&vhca_buf->migf->list_lock);
list_del_init(&vhca_buf->buf_elm);
list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
spin_unlock_irq(&vhca_buf->migf->list_lock);
}
if (*pos >= vhca_buf->start_pos + vhca_buf->length)
mlx5vf_buf_read_done(vhca_buf);
return done;
}
@ -304,7 +337,75 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
wake_up_interruptible(&migf->poll_wait);
}
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
/*
 * Queue the save work item of chunk @chunk_num (1-based) on the device's
 * cb_wq workqueue.
 *
 * The work slot is filled with the owning migration file and the umem size
 * for the next SAVE. A reference on migf->filp is taken before queueing;
 * mlx5vf_mig_file_save_work() drops it with fput() when it finishes.
 */
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	struct mlx5vf_save_work_data *slot = &migf->save_data[chunk_num - 1];

	slot->migf = migf;
	slot->next_required_umem_size = next_required_umem_size;

	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq, &slot->work);
}
/*
 * Take the pre-allocated stop-copy data buffer out of migf->buf[index].
 *
 * If the pre-allocated buffer is large enough for @required_length it is
 * returned as is. Otherwise it is handed back with mlx5vf_put_data_buffer()
 * and a buffer of @required_length is obtained from
 * mlx5vf_get_data_buffer(); the replacement inherits the original buffer's
 * stop_copy_chunk_num.
 *
 * Return: the buffer to use, or an ERR_PTR() - -EINVAL when the slot is
 * unexpectedly empty, or the error from mlx5vf_get_data_buffer().
 */
static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
{
	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
	u8 chunk_num;

	/*
	 * An empty slot is a driver bug: warn, but fail the request instead
	 * of dereferencing a NULL pointer right after the warning.
	 */
	if (WARN_ON(!buf))
		return ERR_PTR(-EINVAL);

	/*
	 * Remember the chunk number now: mlx5vf_put_data_buffer() resets it
	 * to 0 when the buffer is handed back.
	 */
	chunk_num = buf->stop_copy_chunk_num;
	migf->buf[index] = NULL;

	/* Checking whether the pre-allocated buffer can fit */
	if (buf->allocated_length >= required_length)
		return buf;

	mlx5vf_put_data_buffer(buf);
	/* Use the caller's migf rather than reading it from the released buffer */
	buf = mlx5vf_get_data_buffer(migf, required_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return buf;

	buf->stop_copy_chunk_num = chunk_num;
	return buf;
}
/*
 * Work handler that issues the SAVE command for one stop-copy chunk.
 *
 * Runs with mvdev->state_mutex held. Does nothing if the migration file is
 * already in MLX5_MIGF_STATE_ERROR. Any failure to obtain the buffer or to
 * issue the SAVE marks the migration file as errored through
 * mlx5vf_mark_err(). The file reference is dropped on every path.
 */
static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
struct mlx5vf_save_work_data *save_data = container_of(_work,
struct mlx5vf_save_work_data, work);
struct mlx5_vf_migration_file *migf = save_data->migf;
struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
struct mlx5_vhca_data_buffer *buf;
mutex_lock(&mvdev->state_mutex);
if (migf->state == MLX5_MIGF_STATE_ERROR)
goto end;
/* chunk_num is 1-based, the buffer slots are 0-based */
buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
save_data->chunk_num - 1,
save_data->next_required_umem_size);
if (IS_ERR(buf))
goto err;
/* NOTE(review): the trailing 'true, false' look like inc=true, track=false - confirm against the prototype */
if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
goto err_save;
goto end;
err_save:
/* The SAVE was not issued: hand the buffer back */
mlx5vf_put_data_buffer(buf);
err:
mlx5vf_mark_err(migf);
end:
mlx5vf_state_mutex_unlock(mvdev);
/* Drops the reference taken by mlx5vf_mig_file_set_save_work() */
fput(migf->filp);
}
static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
bool track)
{
size_t size = sizeof(struct mlx5_vf_migration_header) +
sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
@ -331,7 +432,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
to_buff = kmap_local_page(page);
memcpy(to_buff, &header, sizeof(header));
header_buf->length = sizeof(header);
data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
memcpy(to_buff + sizeof(header), &data, sizeof(data));
header_buf->length += sizeof(data);
kunmap_local(to_buff);
@ -340,48 +441,86 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
spin_lock_irqsave(&migf->list_lock, flags);
list_add_tail(&header_buf->buf_elm, &migf->buf_list);
spin_unlock_irqrestore(&migf->list_lock, flags);
migf->pre_copy_initial_bytes = size;
if (track)
migf->pre_copy_initial_bytes = size;
return 0;
err:
mlx5vf_put_data_buffer(header_buf);
return ret;
}
static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
size_t state_size)
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
struct mlx5_vf_migration_file *migf,
size_t state_size, u64 full_size,
bool track)
{
struct mlx5_vhca_data_buffer *buf;
size_t inc_state_size;
int num_chunks;
int ret;
int i;
/* let's be ready for stop_copy size that might grow by 10 percents */
if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
inc_state_size = state_size;
if (mvdev->chunk_mode) {
size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
if (IS_ERR(buf))
return PTR_ERR(buf);
migf->buf = buf;
buf = mlx5vf_get_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
/* from firmware perspective at least 'state_size' buffer should be set */
inc_state_size = max(state_size, chunk_size);
} else {
if (track) {
/* let's be ready for stop_copy size that might grow by 10 percents */
if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
inc_state_size = state_size;
} else {
inc_state_size = state_size;
}
}
migf->buf_header = buf;
ret = mlx5vf_add_stop_copy_header(migf);
/* let's not overflow the device specification max SAVE size */
inc_state_size = min_t(size_t, inc_state_size,
(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
for (i = 0; i < num_chunks; i++) {
buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
}
migf->buf[i] = buf;
buf = mlx5vf_get_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
}
migf->buf_header[i] = buf;
if (mvdev->chunk_mode) {
migf->buf[i]->stop_copy_chunk_num = i + 1;
migf->buf_header[i]->stop_copy_chunk_num = i + 1;
INIT_WORK(&migf->save_data[i].work,
mlx5vf_mig_file_save_work);
migf->save_data[i].chunk_num = i + 1;
}
}
ret = mlx5vf_add_stop_copy_header(migf, track);
if (ret)
goto err_header;
goto err;
return 0;
err_header:
mlx5vf_put_data_buffer(migf->buf_header);
migf->buf_header = NULL;
err:
mlx5vf_put_data_buffer(migf->buf);
migf->buf = NULL;
for (i = 0; i < num_chunks; i++) {
if (migf->buf[i]) {
mlx5vf_put_data_buffer(migf->buf[i]);
migf->buf[i] = NULL;
}
if (migf->buf_header[i]) {
mlx5vf_put_data_buffer(migf->buf_header[i]);
migf->buf_header[i] = NULL;
}
}
return ret;
}
@ -428,7 +567,7 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
* As so, the other code below is safe with the proper locks.
*/
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
MLX5VF_QUERY_INC);
NULL, MLX5VF_QUERY_INC);
if (ret)
goto err_state_unlock;
}
@ -505,21 +644,15 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
if (migf->state == MLX5_MIGF_STATE_ERROR)
return -ENODEV;
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
if (ret)
goto err;
/* Checking whether we have a matching pre-allocated buffer that can fit */
if (migf->buf && migf->buf->allocated_length >= length) {
buf = migf->buf;
migf->buf = NULL;
} else {
buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
}
buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
}
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
@ -541,6 +674,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
struct mlx5_vf_migration_file *migf;
struct mlx5_vhca_data_buffer *buf;
size_t length;
u64 full_size;
int ret;
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
@ -574,20 +708,25 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
INIT_LIST_HEAD(&migf->buf_list);
INIT_LIST_HEAD(&migf->avail_list);
spin_lock_init(&migf->list_lock);
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
if (ret)
goto out_pd;
ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
if (ret)
goto out_pd;
if (track) {
ret = mlx5vf_prep_stop_copy(migf, length);
if (ret)
/* leave the allocated buffer ready for the stop-copy phase */
buf = mlx5vf_alloc_data_buffer(migf,
migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_pd;
}
buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_pd;
}
} else {
buf = migf->buf[0];
migf->buf[0] = NULL;
}
ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
@ -820,8 +959,8 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos)
{
struct mlx5_vf_migration_file *migf = filp->private_data;
struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
loff_t requested_length;
bool has_work = false;
ssize_t done = 0;
@ -856,15 +995,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
if (vhca_buf_header->allocated_length < migf->record_size) {
mlx5vf_free_data_buffer(vhca_buf_header);
migf->buf_header = mlx5vf_alloc_data_buffer(migf,
migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
migf->record_size, DMA_NONE);
if (IS_ERR(migf->buf_header)) {
ret = PTR_ERR(migf->buf_header);
migf->buf_header = NULL;
if (IS_ERR(migf->buf_header[0])) {
ret = PTR_ERR(migf->buf_header[0]);
migf->buf_header[0] = NULL;
goto out_unlock;
}
vhca_buf_header = migf->buf_header;
vhca_buf_header = migf->buf_header[0];
}
vhca_buf_header->start_pos = migf->max_pos;
@ -884,15 +1023,15 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
if (vhca_buf->allocated_length < size) {
mlx5vf_free_data_buffer(vhca_buf);
migf->buf = mlx5vf_alloc_data_buffer(migf,
migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
size, DMA_TO_DEVICE);
if (IS_ERR(migf->buf)) {
ret = PTR_ERR(migf->buf);
migf->buf = NULL;
if (IS_ERR(migf->buf[0])) {
ret = PTR_ERR(migf->buf[0]);
migf->buf[0] = NULL;
goto out_unlock;
}
vhca_buf = migf->buf;
vhca_buf = migf->buf[0];
}
vhca_buf->start_pos = migf->max_pos;
@ -974,7 +1113,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
goto out_pd;
}
migf->buf = buf;
migf->buf[0] = buf;
if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
buf = mlx5vf_alloc_data_buffer(migf,
sizeof(struct mlx5_vf_migration_header), DMA_NONE);
@ -983,7 +1122,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
goto out_buf;
}
migf->buf_header = buf;
migf->buf_header[0] = buf;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
} else {
/* Initial state will be to read the image */
@ -997,7 +1136,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
spin_lock_init(&migf->list_lock);
return migf;
out_buf:
mlx5vf_free_data_buffer(migf->buf);
mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
mlx5vf_cmd_dealloc_pd(migf);
out_free:
@ -1019,6 +1158,7 @@ void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
cancel_work_sync(&mvdev->saving_migf->async_data.work);
mlx5vf_disable_fd(mvdev->saving_migf);
wake_up_interruptible(&mvdev->saving_migf->poll_wait);
mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
fput(mvdev->saving_migf->filp);
mvdev->saving_migf = NULL;
@ -1100,7 +1240,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
ret = mlx5vf_cmd_load_vhca_state(mvdev,
mvdev->resuming_migf,
mvdev->resuming_migf->buf);
mvdev->resuming_migf->buf[0]);
if (ret)
return ERR_PTR(ret);
}
@ -1194,13 +1334,14 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
struct mlx5vf_pci_core_device *mvdev = container_of(
vdev, struct mlx5vf_pci_core_device, core_device.vdev);
size_t state_size;
u64 total_size;
int ret;
mutex_lock(&mvdev->state_mutex);
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
&state_size, 0);
ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
&total_size, 0);
if (!ret)
*stop_copy_length = state_size;
*stop_copy_length = total_size;
mlx5vf_state_mutex_unlock(mvdev);
return ret;
}

View File

@ -946,6 +946,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
unsigned long last;
comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);
/* Empty list */
if (WARN_ON_ONCE(!comb_start))
return;
curr = comb_start;
while (curr) {
last = curr->last;
@ -975,6 +980,11 @@ void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
prev = curr;
curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
}
/* Empty list or no nodes to combine */
if (WARN_ON_ONCE(min_gap == ULONG_MAX))
break;
comb_start->last = comb_end->last;
interval_tree_remove(comb_end, root);
cur_nodes--;

View File

@ -21,11 +21,13 @@
struct cdx_controller;
/*
 * Kinds of device configuration request.
 * NOTE(review): presumably the values carried in the 'type' field of
 * struct cdx_device_config - confirm against the users.
 */
enum {
CDX_DEV_BUS_MASTER_CONF,
CDX_DEV_RESET_CONF,
};
/* Describes a single device configuration request. */
struct cdx_device_config {
u8 type; /* presumably one of the CDX_DEV_*_CONF values - TODO confirm */
bool bus_master_enable; /* NOTE(review): looks meaningful only for CDX_DEV_BUS_MASTER_CONF - confirm */
};
typedef int (*cdx_scan_cb)(struct cdx_controller *cdx);
@ -170,4 +172,20 @@ extern struct bus_type cdx_bus_type;
*/
int cdx_dev_reset(struct device *dev);
/**
* cdx_set_master - enables bus-mastering for CDX device
* @cdx_dev: the CDX device to enable
*
* Return: 0 for success, -errno on failure
*/
int cdx_set_master(struct cdx_device *cdx_dev);
/**
* cdx_clear_master - disables bus-mastering for CDX device
* @cdx_dev: the CDX device to disable
*
* Return: 0 for success, -errno on failure
*/
int cdx_clear_master(struct cdx_device *cdx_dev);
#endif /* _CDX_BUS_H_ */

View File

@ -277,8 +277,8 @@ struct vfio_region_info {
#define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */
__u32 index; /* Region index */
__u32 cap_offset; /* Offset within info struct of first cap */
__u64 size; /* Region size (bytes) */
__u64 offset; /* Region offset from start of device fd */
__aligned_u64 size; /* Region size (bytes) */
__aligned_u64 offset; /* Region offset from start of device fd */
};
#define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8)
@ -294,8 +294,8 @@ struct vfio_region_info {
#define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1
struct vfio_region_sparse_mmap_area {
__u64 offset; /* Offset of mmap'able area within region */
__u64 size; /* Size of mmap'able area */
__aligned_u64 offset; /* Offset of mmap'able area within region */
__aligned_u64 size; /* Size of mmap'able area */
};
struct vfio_region_info_cap_sparse_mmap {
@ -450,9 +450,9 @@ struct vfio_device_migration_info {
VFIO_DEVICE_STATE_V1_RESUMING)
__u32 reserved;
__u64 pending_bytes;
__u64 data_offset;
__u64 data_size;
__aligned_u64 pending_bytes;
__aligned_u64 data_offset;
__aligned_u64 data_size;
};
/*
@ -476,7 +476,7 @@ struct vfio_device_migration_info {
struct vfio_region_info_cap_nvlink2_ssatgt {
struct vfio_info_cap_header header;
__u64 tgt;
__aligned_u64 tgt;
};
/*
@ -816,7 +816,7 @@ struct vfio_device_gfx_plane_info {
__u32 drm_plane_type; /* type of plane: DRM_PLANE_TYPE_* */
/* out */
__u32 drm_format; /* drm format of plane */
__u64 drm_format_mod; /* tiled mode */
__aligned_u64 drm_format_mod; /* tiled mode */
__u32 width; /* width of plane */
__u32 height; /* height of plane */
__u32 stride; /* stride of plane */
@ -829,6 +829,7 @@ struct vfio_device_gfx_plane_info {
__u32 region_index; /* region index */
__u32 dmabuf_id; /* dma-buf id */
};
__u32 reserved;
};
#define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14)
@ -863,9 +864,10 @@ struct vfio_device_ioeventfd {
#define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */
#define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */
#define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf)
__u64 offset; /* device fd offset of write */
__u64 data; /* data to be written */
__aligned_u64 offset; /* device fd offset of write */
__aligned_u64 data; /* data to be written */
__s32 fd; /* -1 for de-assignment */
__u32 reserved;
};
#define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16)
@ -1434,6 +1436,27 @@ struct vfio_device_feature_mig_data_size {
#define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9
/**
* Upon VFIO_DEVICE_FEATURE_SET, set or clear the BUS mastering for the device
* based on the operation specified in op flag.
*
* The functionality is incorporated for devices that need bus master control,
* but the in-band device interface lacks the support. Consequently, it is not
* applicable to PCI devices, as bus master control for PCI devices is managed
* in-band through the configuration space. At present, this feature is supported
* only for CDX devices.
* When the device's BUS MASTER setting is configured as CLEAR, it will result in
* blocking all incoming DMA requests from the device. On the other hand, configuring
* the device's BUS MASTER setting as SET (enable) will grant the device the
* capability to perform DMA to the host memory.
*/
struct vfio_device_feature_bus_master {
/* Requested operation: one of the VFIO_DEVICE_FEATURE_*_MASTER values below */
__u32 op;
#define VFIO_DEVICE_FEATURE_CLEAR_MASTER 0 /* Clear Bus Master */
#define VFIO_DEVICE_FEATURE_SET_MASTER 1 /* Set Bus Master */
};
#define VFIO_DEVICE_FEATURE_BUS_MASTER 10
/* -------- API for Type1 VFIO IOMMU -------- */
/**
@ -1449,7 +1472,7 @@ struct vfio_iommu_type1_info {
__u32 flags;
#define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */
#define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */
__u64 iova_pgsizes; /* Bitmap of supported page sizes */
__aligned_u64 iova_pgsizes; /* Bitmap of supported page sizes */
__u32 cap_offset; /* Offset within info struct of first cap */
__u32 pad;
};

View File

@ -1262,7 +1262,7 @@ static long mbochs_ioctl(struct vfio_device *vdev, unsigned int cmd,
case VFIO_DEVICE_QUERY_GFX_PLANE:
{
struct vfio_device_gfx_plane_info plane;
struct vfio_device_gfx_plane_info plane = {};
minsz = offsetofend(struct vfio_device_gfx_plane_info,
region_index);

View File

@ -591,7 +591,7 @@ static long mdpy_ioctl(struct vfio_device *vdev, unsigned int cmd,
case VFIO_DEVICE_QUERY_GFX_PLANE:
{
struct vfio_device_gfx_plane_info plane;
struct vfio_device_gfx_plane_info plane = {};
minsz = offsetofend(struct vfio_device_gfx_plane_info,
region_index);

View File

@ -29,6 +29,8 @@
#include <linux/serial.h>
#include <uapi/linux/serial_reg.h>
#include <linux/eventfd.h>
#include <linux/anon_inodes.h>
/*
* #defines
*/
@ -124,10 +126,32 @@ struct serial_port {
u8 intr_trigger_level; /* interrupt trigger level */
};
/*
 * Layout of the migration data stream: a fixed header (every field that
 * precedes ports[]) followed by the per-port serial state.
 */
struct mtty_data {
u64 magic;
#define MTTY_MAGIC 0x7e9d09898c3e2c4e /* Nothing clever, just random */
u32 major_ver;
#define MTTY_MAJOR_VER 1
u32 minor_ver;
#define MTTY_MINOR_VER 0
/* Number of ports[] entries that belong to the stream */
u32 nr_ports;
/* The resume write handler rejects a header whose flags are non-zero */
u32 flags;
struct serial_port ports[2];
};
struct mdev_state;
/* State backing one migration data file (either the saving or resuming one). */
struct mtty_migration_file {
struct file *filp; /* anonymous inode file created for this stream */
struct mutex lock; /* held around accesses to data, filled_size and disabled */
struct mdev_state *mdev_state; /* owning device */
struct mtty_data data; /* buffered migration stream */
ssize_t filled_size; /* number of valid bytes currently in data */
u8 disabled:1; /* once set, read/write/ioctl on the file return -ENODEV */
};
/* State of each mdev device */
struct mdev_state {
struct vfio_device vdev;
int irq_fd;
struct eventfd_ctx *intx_evtfd;
struct eventfd_ctx *msi_evtfd;
int irq_index;
@ -141,6 +165,13 @@ struct mdev_state {
struct mutex rxtx_lock;
struct vfio_device_info dev_info;
int nr_ports;
enum vfio_device_mig_state state;
struct mutex state_mutex;
struct mutex reset_mutex;
struct mtty_migration_file *saving_migf;
struct mtty_migration_file *resuming_migf;
u8 deferred_reset:1;
u8 intx_mask:1;
};
static struct mtty_type {
@ -166,10 +197,6 @@ static const struct file_operations vd_fops = {
static const struct vfio_device_ops mtty_dev_ops;
/* function prototypes */
static int mtty_trigger_interrupt(struct mdev_state *mdev_state);
/* Helper functions */
static void dump_buffer(u8 *buf, uint32_t count)
@ -186,6 +213,36 @@ static void dump_buffer(u8 *buf, uint32_t count)
#endif
}
static bool is_intx(struct mdev_state *mdev_state)
{
return mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX;
}
static bool is_msi(struct mdev_state *mdev_state)
{
return mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX;
}
static bool is_noirq(struct mdev_state *mdev_state)
{
return !is_intx(mdev_state) && !is_msi(mdev_state);
}
/*
 * Signal the eventfd of the currently selected interrupt type, if any.
 *
 * The caller must hold ops_lock.  An INTx interrupt is only delivered
 * while it is not masked, and delivering it sets the mask.
 */
static void mtty_trigger_interrupt(struct mdev_state *mdev_state)
{
	lockdep_assert_held(&mdev_state->ops_lock);

	if (is_msi(mdev_state)) {
		if (!mdev_state->msi_evtfd)
			return;
		eventfd_signal(mdev_state->msi_evtfd, 1);
		return;
	}

	if (!is_intx(mdev_state))
		return;

	/* No eventfd registered, or INTx is currently masked: nothing to do */
	if (!mdev_state->intx_evtfd || mdev_state->intx_mask)
		return;

	eventfd_signal(mdev_state->intx_evtfd, 1);
	mdev_state->intx_mask = true;
}
static void mtty_create_config_space(struct mdev_state *mdev_state)
{
/* PCI dev ID */
@ -717,6 +774,543 @@ static ssize_t mdev_access(struct mdev_state *mdev_state, u8 *buf, size_t count,
return ret;
}
/*
 * Size in bytes of a complete migration stream for this device: the
 * header that precedes ports[] plus one struct serial_port per port.
 */
static size_t mtty_data_size(struct mdev_state *mdev_state)
{
	size_t header_len = offsetof(struct mtty_data, ports);
	size_t ports_len = mdev_state->nr_ports * sizeof(struct serial_port);

	return header_len + ports_len;
}
static void mtty_disable_file(struct mtty_migration_file *migf)
{
mutex_lock(&migf->lock);
migf->disabled = true;
migf->filled_size = 0;
migf->filp->f_pos = 0;
mutex_unlock(&migf->lock);
}
static void mtty_disable_files(struct mdev_state *mdev_state)
{
if (mdev_state->saving_migf) {
mtty_disable_file(mdev_state->saving_migf);
fput(mdev_state->saving_migf->filp);
mdev_state->saving_migf = NULL;
}
if (mdev_state->resuming_migf) {
mtty_disable_file(mdev_state->resuming_migf);
fput(mdev_state->resuming_migf->filp);
mdev_state->resuming_migf = NULL;
}
}
/*
 * Release state_mutex, first servicing any reset that was flagged as
 * deferred.  Every caller in this file takes state_mutex before calling.
 *
 * A pending deferred reset puts the device back into the RUNNING state and
 * disables the migration files; the check is then repeated before
 * state_mutex is finally dropped.
 */
static void mtty_state_mutex_unlock(struct mdev_state *mdev_state)
{
again:
mutex_lock(&mdev_state->reset_mutex);
if (mdev_state->deferred_reset) {
/* Consume the request, then act on it without holding reset_mutex */
mdev_state->deferred_reset = false;
mutex_unlock(&mdev_state->reset_mutex);
mdev_state->state = VFIO_DEVICE_STATE_RUNNING;
mtty_disable_files(mdev_state);
/* Re-check the flag before releasing state_mutex */
goto again;
}
/* state_mutex is released while reset_mutex is still held */
mutex_unlock(&mdev_state->state_mutex);
mutex_unlock(&mdev_state->reset_mutex);
}
/*
 * ->release() handler shared by the saving and resuming migration files:
 * disables the file and frees the mtty_migration_file behind it.
 * Always returns 0.
 */
static int mtty_release_migf(struct inode *inode, struct file *filp)
{
	struct mtty_migration_file *mig_file = filp->private_data;

	mtty_disable_file(mig_file);

	mutex_destroy(&mig_file->lock);
	kfree(mig_file);

	return 0;
}
/*
 * ioctl handler of the saving migration file; only
 * VFIO_MIG_GET_PRECOPY_INFO is implemented.
 *
 * Reports the bytes still unread from the file as initial_bytes and always
 * reports zero dirty_bytes.
 *
 * Return: 0 on success, -ENOTTY for any other command, -EFAULT if the user
 * buffer cannot be copied, -EINVAL if argsz is too small, the device is not
 * in a PRE_COPY state or the file position is beyond the filled data,
 * -ENODEV if the file has been disabled.
 */
static long mtty_precopy_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
struct mtty_migration_file *migf = filp->private_data;
struct mdev_state *mdev_state = migf->mdev_state;
loff_t *pos = &filp->f_pos;
struct vfio_precopy_info info = {};
unsigned long minsz;
int ret;
if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
return -ENOTTY;
minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
if (info.argsz < minsz)
return -EINVAL;
/* state_mutex is taken first, migf->lock nests inside it */
mutex_lock(&mdev_state->state_mutex);
if (mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY &&
mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
ret = -EINVAL;
goto unlock;
}
mutex_lock(&migf->lock);
if (migf->disabled) {
mutex_unlock(&migf->lock);
ret = -ENODEV;
goto unlock;
}
if (*pos > migf->filled_size) {
mutex_unlock(&migf->lock);
ret = -EINVAL;
goto unlock;
}
info.dirty_bytes = 0;
/* Whatever has been filled but not yet read */
info.initial_bytes = migf->filled_size - *pos;
mutex_unlock(&migf->lock);
ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
unlock:
mtty_state_mutex_unlock(mdev_state);
return ret;
}
/*
 * ->read() handler of the saving migration file.
 *
 * Copies up to @len bytes of the buffered migration data, starting at the
 * file's own position, to @buf and advances that position.  The file is a
 * stream, so an explicit position (@pos non-NULL) is refused.
 *
 * Return: number of bytes copied (0 when nothing is left to read),
 * -ESPIPE if @pos is non-NULL, -ENODEV if the file has been disabled,
 * -EINVAL if the file position is beyond the filled data, -EFAULT if the
 * copy to userspace fails.
 */
static ssize_t mtty_save_read(struct file *filp, char __user *buf,
			      size_t len, loff_t *pos)
{
	struct mtty_migration_file *migf = filp->private_data;
	ssize_t ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	mutex_lock(&migf->lock);

	dev_dbg(migf->mdev_state->vdev.dev, "%s ask %zu\n", __func__, len);

	if (migf->disabled) {
		ret = -ENODEV;
		goto out_unlock;
	}

	if (*pos > migf->filled_size) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Never hand out more than what has been filled so far */
	len = min_t(size_t, migf->filled_size - *pos, len);
	if (len) {
		if (copy_to_user(buf, (void *)&migf->data + *pos, len)) {
			ret = -EFAULT;
			goto out_unlock;
		}
		*pos += len;
		ret = len;
	}
out_unlock:
	/* ret is signed and may hold a negative errno here, hence %zd */
	dev_dbg(migf->mdev_state->vdev.dev, "%s read %zd\n", __func__, ret);
	mutex_unlock(&migf->lock);
	return ret;
}
/*
 * File operations of the saving migration file: a read-only stream that
 * also answers the precopy info ioctl.
 */
static const struct file_operations mtty_save_fops = {
.owner = THIS_MODULE,
.read = mtty_save_read,
.unlocked_ioctl = mtty_precopy_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.release = mtty_release_migf,
.llseek = no_llseek,
};
/*
 * Copy the state of every serial port into the saving migration file and
 * grow its filled size by one struct serial_port per port.
 */
static void mtty_save_state(struct mdev_state *mdev_state)
{
	struct mtty_migration_file *migf = mdev_state->saving_migf;
	const size_t port_sz = sizeof(struct serial_port);
	int port = 0;

	mutex_lock(&migf->lock);

	while (port < mdev_state->nr_ports) {
		memcpy(&migf->data.ports[port], &mdev_state->s[port], port_sz);
		migf->filled_size += port_sz;
		port++;
	}

	dev_dbg(mdev_state->vdev.dev,
		"%s filled to %zu\n", __func__, migf->filled_size);

	mutex_unlock(&migf->lock);
}
/*
 * Restore the serial port state from the resuming migration file.
 *
 * Return: 0 on success, -EINVAL if less data than a complete stream for
 * this device has been written to the file.
 */
static int mtty_load_state(struct mdev_state *mdev_state)
{
	struct mtty_migration_file *migf = mdev_state->resuming_migf;
	int ret = 0;
	int port;

	mutex_lock(&migf->lock);

	/* magic and version already tested by resume write fn */
	if (migf->filled_size < mtty_data_size(mdev_state)) {
		dev_dbg(mdev_state->vdev.dev, "%s expected %zu, got %zu\n",
			__func__, mtty_data_size(mdev_state),
			migf->filled_size);
		ret = -EINVAL;
		goto out_unlock;
	}

	for (port = 0; port < mdev_state->nr_ports; port++)
		memcpy(&mdev_state->s[port], &migf->data.ports[port],
		       sizeof(struct serial_port));

out_unlock:
	mutex_unlock(&migf->lock);
	return ret;
}
static struct mtty_migration_file *
mtty_save_device_data(struct mdev_state *mdev_state,
enum vfio_device_mig_state state)
{
struct mtty_migration_file *migf = mdev_state->saving_migf;
struct mtty_migration_file *ret = NULL;
if (migf) {
if (state == VFIO_DEVICE_STATE_STOP_COPY)
goto fill_data;
return ret;
}
migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
if (!migf)
return ERR_PTR(-ENOMEM);
migf->filp = anon_inode_getfile("mtty_mig", &mtty_save_fops,
migf, O_RDONLY);
if (IS_ERR(migf->filp)) {
int rc = PTR_ERR(migf->filp);
kfree(migf);
return ERR_PTR(rc);
}
stream_open(migf->filp->f_inode, migf->filp);
mutex_init(&migf->lock);
migf->mdev_state = mdev_state;
migf->data.magic = MTTY_MAGIC;
migf->data.major_ver = MTTY_MAJOR_VER;
migf->data.minor_ver = MTTY_MINOR_VER;
migf->data.nr_ports = mdev_state->nr_ports;
migf->filled_size = offsetof(struct mtty_data, ports);
dev_dbg(mdev_state->vdev.dev, "%s filled header to %zu\n",
__func__, migf->filled_size);
ret = mdev_state->saving_migf = migf;
fill_data:
if (state == VFIO_DEVICE_STATE_STOP_COPY)
mtty_save_state(mdev_state);
return ret;
}
/*
 * ->write() handler of the resuming migration file.
 *
 * Appends @len bytes from @buf to the buffered migration data at the
 * file's own position.  The header is validated by the write that
 * completes it (magic, versions, port count, zero flags).
 *
 * Return: @len on success, -ESPIPE if @pos is non-NULL, -EINVAL on a
 * negative position or length overflow, -ENOMEM if the write would exceed
 * the stream size for this device, -ENODEV if the file has been disabled,
 * -EFAULT if the copy from userspace or the header validation fails.
 */
static ssize_t mtty_resume_write(struct file *filp, const char __user *buf,
size_t len, loff_t *pos)
{
struct mtty_migration_file *migf = filp->private_data;
struct mdev_state *mdev_state = migf->mdev_state;
loff_t requested_length;
ssize_t ret = 0;
if (pos)
return -ESPIPE;
pos = &filp->f_pos;
if (*pos < 0 ||
check_add_overflow((loff_t)len, *pos, &requested_length))
return -EINVAL;
/* Refuse anything that would not fit in a complete stream */
if (requested_length > mtty_data_size(mdev_state))
return -ENOMEM;
mutex_lock(&migf->lock);
if (migf->disabled) {
ret = -ENODEV;
goto out_unlock;
}
if (copy_from_user((void *)&migf->data + *pos, buf, len)) {
ret = -EFAULT;
goto out_unlock;
}
*pos += len;
ret = len;
dev_dbg(migf->mdev_state->vdev.dev, "%s received %zu, total %zu\n",
__func__, len, migf->filled_size + len);
/* Validate only on the write that crosses the end of the header */
if (migf->filled_size < offsetof(struct mtty_data, ports) &&
migf->filled_size + len >= offsetof(struct mtty_data, ports)) {
if (migf->data.magic != MTTY_MAGIC || migf->data.flags ||
migf->data.major_ver != MTTY_MAJOR_VER ||
migf->data.minor_ver != MTTY_MINOR_VER ||
migf->data.nr_ports != mdev_state->nr_ports) {
dev_dbg(migf->mdev_state->vdev.dev,
"%s failed validation\n", __func__);
ret = -EFAULT;
} else {
dev_dbg(migf->mdev_state->vdev.dev,
"%s header validated\n", __func__);
}
}
/*
 * NOTE(review): filled_size (and the file position above) advance even
 * when header validation failed, so later writes skip the validation
 * branch - confirm this is intended.
 */
migf->filled_size += len;
out_unlock:
mutex_unlock(&migf->lock);
return ret;
}
/* File operations of the resuming migration file: a write-only stream. */
static const struct file_operations mtty_resume_fops = {
.owner = THIS_MODULE,
.write = mtty_resume_write,
.release = mtty_release_migf,
.llseek = no_llseek,
};
/*
 * Create the write-only migration file used to receive device state and
 * record it as the device's resuming file.
 *
 * Return: the new migration file, or an ERR_PTR() on allocation or file
 * creation failure.
 */
static struct mtty_migration_file *
mtty_resume_device_data(struct mdev_state *mdev_state)
{
	struct mtty_migration_file *migf;
	int err;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mtty_mig", &mtty_resume_fops,
					migf, O_WRONLY);
	if (IS_ERR(migf->filp)) {
		err = PTR_ERR(migf->filp);
		kfree(migf);
		return ERR_PTR(err);
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	migf->mdev_state = mdev_state;

	mdev_state->resuming_migf = migf;

	return migf;
}
/*
 * Perform a single migration state transition from the device's current
 * state to @new.  The device state itself is not updated here; the caller
 * (mtty_set_state) stores the new state after a successful step.
 *
 * Return: a referenced migration file for transitions that produce one,
 * NULL for transitions that do not, or an ERR_PTR() on failure (including
 * -EINVAL for an arc that is not handled).
 */
static struct file *mtty_step_state(struct mdev_state *mdev_state,
enum vfio_device_mig_state new)
{
enum vfio_device_mig_state cur = mdev_state->state;
dev_dbg(mdev_state->vdev.dev, "%s: %d -> %d\n", __func__, cur, new);
/*
* The following state transitions are no-op considering
* mtty does not do DMA nor require any explicit start/stop.
*
* RUNNING -> RUNNING_P2P
* RUNNING_P2P -> RUNNING
* RUNNING_P2P -> STOP
* PRE_COPY -> PRE_COPY_P2P
* PRE_COPY_P2P -> PRE_COPY
* STOP -> RUNNING_P2P
*/
if ((cur == VFIO_DEVICE_STATE_RUNNING &&
new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
(cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
(new == VFIO_DEVICE_STATE_RUNNING ||
new == VFIO_DEVICE_STATE_STOP)) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY &&
new == VFIO_DEVICE_STATE_PRE_COPY_P2P) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
new == VFIO_DEVICE_STATE_PRE_COPY) ||
(cur == VFIO_DEVICE_STATE_STOP &&
new == VFIO_DEVICE_STATE_RUNNING_P2P))
return NULL;
/*
* The following state transitions simply close migration files,
* with the exception of RESUMING -> STOP, which needs to load
* the state first.
*
* RESUMING -> STOP
* PRE_COPY -> RUNNING
* PRE_COPY_P2P -> RUNNING_P2P
* STOP_COPY -> STOP
*/
if (cur == VFIO_DEVICE_STATE_RESUMING &&
new == VFIO_DEVICE_STATE_STOP) {
int ret;
ret = mtty_load_state(mdev_state);
if (ret)
return ERR_PTR(ret);
mtty_disable_files(mdev_state);
return NULL;
}
if ((cur == VFIO_DEVICE_STATE_PRE_COPY &&
new == VFIO_DEVICE_STATE_RUNNING) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
(cur == VFIO_DEVICE_STATE_STOP_COPY &&
new == VFIO_DEVICE_STATE_STOP)) {
mtty_disable_files(mdev_state);
return NULL;
}
/*
* The following state transitions return migration files.
*
* RUNNING -> PRE_COPY
* RUNNING_P2P -> PRE_COPY_P2P
* STOP -> STOP_COPY
* STOP -> RESUMING
* PRE_COPY_P2P -> STOP_COPY
*/
if ((cur == VFIO_DEVICE_STATE_RUNNING &&
new == VFIO_DEVICE_STATE_PRE_COPY) ||
(cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
new == VFIO_DEVICE_STATE_PRE_COPY_P2P) ||
(cur == VFIO_DEVICE_STATE_STOP &&
new == VFIO_DEVICE_STATE_STOP_COPY) ||
(cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
new == VFIO_DEVICE_STATE_STOP_COPY)) {
struct mtty_migration_file *migf;
migf = mtty_save_device_data(mdev_state, new);
if (IS_ERR(migf))
return ERR_CAST(migf);
/* NULL means an already open saving file was reused */
if (migf) {
/* Extra reference for the file handed back to the caller */
get_file(migf->filp);
return migf->filp;
}
return NULL;
}
if (cur == VFIO_DEVICE_STATE_STOP &&
new == VFIO_DEVICE_STATE_RESUMING) {
struct mtty_migration_file *migf;
migf = mtty_resume_device_data(mdev_state);
if (IS_ERR(migf))
return ERR_CAST(migf);
/* Extra reference for the file handed back to the caller */
get_file(migf->filp);
return migf->filp;
}
/* vfio_mig_get_next_state() does not use arcs other than the above */
WARN_ON(true);
return ERR_PTR(-EINVAL);
}
/*
 * migration_set_state callback: walk the device from its current state to
 * @new_state one arc at a time, as chosen by vfio_mig_get_next_state().
 *
 * Return: the file produced by the final step (may be NULL), or an
 * ERR_PTR() if selecting or performing a step fails.
 */
static struct file *mtty_set_state(struct vfio_device *vdev,
enum vfio_device_mig_state new_state)
{
struct mdev_state *mdev_state =
container_of(vdev, struct mdev_state, vdev);
struct file *ret = NULL;
dev_dbg(vdev->dev, "%s -> %d\n", __func__, new_state);
mutex_lock(&mdev_state->state_mutex);
while (mdev_state->state != new_state) {
enum vfio_device_mig_state next_state;
int rc = vfio_mig_get_next_state(vdev, mdev_state->state,
new_state, &next_state);
if (rc) {
ret = ERR_PTR(rc);
break;
}
ret = mtty_step_state(mdev_state, next_state);
if (IS_ERR(ret))
break;
/* The step succeeded; commit the intermediate state */
mdev_state->state = next_state;
/* A file is only expected from the step that reaches new_state */
if (WARN_ON(ret && new_state != next_state)) {
fput(ret);
ret = ERR_PTR(-EINVAL);
break;
}
}
mtty_state_mutex_unlock(mdev_state);
return ret;
}
static int mtty_get_state(struct vfio_device *vdev,
enum vfio_device_mig_state *current_state)
{
struct mdev_state *mdev_state =
container_of(vdev, struct mdev_state, vdev);
mutex_lock(&mdev_state->state_mutex);
*current_state = mdev_state->state;
mtty_state_mutex_unlock(mdev_state);
return 0;
}
/*
 * migration_get_data_size callback: report the size of a complete
 * migration stream for this device.  Always returns 0.
 */
static int mtty_get_data_size(struct vfio_device *vdev,
			      unsigned long *stop_copy_length)
{
	struct mdev_state *mdev_state;

	mdev_state = container_of(vdev, struct mdev_state, vdev);
	*stop_copy_length = mtty_data_size(mdev_state);

	return 0;
}
/* Migration callbacks installed as vdev->mig_ops in mtty_init_dev() */
static const struct vfio_migration_ops mtty_migration_ops = {
.migration_set_state = mtty_set_state,
.migration_get_state = mtty_get_state,
.migration_get_data_size = mtty_get_data_size,
};
/* log_start callback: does nothing and always reports success. */
static int mtty_log_start(struct vfio_device *vdev,
struct rb_root_cached *ranges,
u32 nnodes, u64 *page_size)
{
return 0;
}
/* log_stop callback: does nothing and always reports success. */
static int mtty_log_stop(struct vfio_device *vdev)
{
return 0;
}
/*
 * log_read_and_clear callback: leaves @dirty untouched and always reports
 * success.
 */
static int mtty_log_read_and_clear(struct vfio_device *vdev,
unsigned long iova, unsigned long length,
struct iova_bitmap *dirty)
{
return 0;
}
/* Logging callbacks installed as vdev->log_ops in mtty_init_dev() */
static const struct vfio_log_ops mtty_log_ops = {
.log_start = mtty_log_start,
.log_stop = mtty_log_stop,
.log_read_and_clear = mtty_log_read_and_clear,
};
static int mtty_init_dev(struct vfio_device *vdev)
{
struct mdev_state *mdev_state =
@ -749,6 +1343,16 @@ static int mtty_init_dev(struct vfio_device *vdev)
mutex_init(&mdev_state->ops_lock);
mdev_state->mdev = mdev;
mtty_create_config_space(mdev_state);
mutex_init(&mdev_state->state_mutex);
mutex_init(&mdev_state->reset_mutex);
vdev->migration_flags = VFIO_MIGRATION_STOP_COPY |
VFIO_MIGRATION_P2P |
VFIO_MIGRATION_PRE_COPY;
vdev->mig_ops = &mtty_migration_ops;
vdev->log_ops = &mtty_log_ops;
mdev_state->state = VFIO_DEVICE_STATE_RUNNING;
return 0;
err_nr_ports:
@ -782,6 +1386,8 @@ static void mtty_release_dev(struct vfio_device *vdev)
struct mdev_state *mdev_state =
container_of(vdev, struct mdev_state, vdev);
mutex_destroy(&mdev_state->reset_mutex);
mutex_destroy(&mdev_state->state_mutex);
atomic_add(mdev_state->nr_ports, &mdev_avail_ports);
kfree(mdev_state->vconfig);
}
@ -798,6 +1404,15 @@ static int mtty_reset(struct mdev_state *mdev_state)
{
pr_info("%s: called\n", __func__);
mutex_lock(&mdev_state->reset_mutex);
mdev_state->deferred_reset = true;
if (!mutex_trylock(&mdev_state->state_mutex)) {
mutex_unlock(&mdev_state->reset_mutex);
return 0;
}
mutex_unlock(&mdev_state->reset_mutex);
mtty_state_mutex_unlock(mdev_state);
return 0;
}
@ -921,6 +1536,25 @@ static ssize_t mtty_write(struct vfio_device *vdev, const char __user *buf,
return -EFAULT;
}
/*
 * Tear down INTx signalling: release the eventfd, clear the mask and mark
 * the device as having no interrupt index selected.  Does nothing when no
 * INTx eventfd is registered.
 */
static void mtty_disable_intx(struct mdev_state *mdev_state)
{
	struct eventfd_ctx *evt = mdev_state->intx_evtfd;

	if (!evt)
		return;

	eventfd_ctx_put(evt);
	mdev_state->intx_evtfd = NULL;
	mdev_state->intx_mask = false;
	mdev_state->irq_index = -1;
}
/*
 * Tear down MSI signalling: release the eventfd and mark the device as
 * having no interrupt index selected.  Does nothing when no MSI eventfd
 * is registered.
 */
static void mtty_disable_msi(struct mdev_state *mdev_state)
{
	struct eventfd_ctx *evt = mdev_state->msi_evtfd;

	if (!evt)
		return;

	eventfd_ctx_put(evt);
	mdev_state->msi_evtfd = NULL;
	mdev_state->irq_index = -1;
}
static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
unsigned int index, unsigned int start,
unsigned int count, void *data)
@ -932,59 +1566,113 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
case VFIO_PCI_INTX_IRQ_INDEX:
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
case VFIO_IRQ_SET_ACTION_MASK:
if (!is_intx(mdev_state) || start != 0 || count != 1) {
ret = -EINVAL;
break;
}
if (flags & VFIO_IRQ_SET_DATA_NONE) {
mdev_state->intx_mask = true;
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t mask = *(uint8_t *)data;
if (mask)
mdev_state->intx_mask = true;
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
ret = -ENOTTY; /* No support for mask fd */
}
break;
case VFIO_IRQ_SET_ACTION_UNMASK:
if (!is_intx(mdev_state) || start != 0 || count != 1) {
ret = -EINVAL;
break;
}
if (flags & VFIO_IRQ_SET_DATA_NONE) {
mdev_state->intx_mask = false;
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t mask = *(uint8_t *)data;
if (mask)
mdev_state->intx_mask = false;
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
ret = -ENOTTY; /* No support for unmask fd */
}
break;
case VFIO_IRQ_SET_ACTION_TRIGGER:
{
if (flags & VFIO_IRQ_SET_DATA_NONE) {
pr_info("%s: disable INTx\n", __func__);
if (mdev_state->intx_evtfd)
eventfd_ctx_put(mdev_state->intx_evtfd);
if (is_intx(mdev_state) && !count &&
(flags & VFIO_IRQ_SET_DATA_NONE)) {
mtty_disable_intx(mdev_state);
break;
}
if (!(is_intx(mdev_state) || is_noirq(mdev_state)) ||
start != 0 || count != 1) {
ret = -EINVAL;
break;
}
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int fd = *(int *)data;
struct eventfd_ctx *evt;
if (fd > 0) {
struct eventfd_ctx *evt;
mtty_disable_intx(mdev_state);
evt = eventfd_ctx_fdget(fd);
if (IS_ERR(evt)) {
ret = PTR_ERR(evt);
break;
}
mdev_state->intx_evtfd = evt;
mdev_state->irq_fd = fd;
mdev_state->irq_index = index;
if (fd < 0)
break;
evt = eventfd_ctx_fdget(fd);
if (IS_ERR(evt)) {
ret = PTR_ERR(evt);
break;
}
mdev_state->intx_evtfd = evt;
mdev_state->irq_index = index;
break;
}
if (!is_intx(mdev_state)) {
ret = -EINVAL;
break;
}
if (flags & VFIO_IRQ_SET_DATA_NONE) {
mtty_trigger_interrupt(mdev_state);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t trigger = *(uint8_t *)data;
if (trigger)
mtty_trigger_interrupt(mdev_state);
}
break;
}
}
break;
case VFIO_PCI_MSI_IRQ_INDEX:
switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
case VFIO_IRQ_SET_ACTION_MASK:
case VFIO_IRQ_SET_ACTION_UNMASK:
ret = -ENOTTY;
break;
case VFIO_IRQ_SET_ACTION_TRIGGER:
if (flags & VFIO_IRQ_SET_DATA_NONE) {
if (mdev_state->msi_evtfd)
eventfd_ctx_put(mdev_state->msi_evtfd);
pr_info("%s: disable MSI\n", __func__);
mdev_state->irq_index = VFIO_PCI_INTX_IRQ_INDEX;
if (is_msi(mdev_state) && !count &&
(flags & VFIO_IRQ_SET_DATA_NONE)) {
mtty_disable_msi(mdev_state);
break;
}
if (!(is_msi(mdev_state) || is_noirq(mdev_state)) ||
start != 0 || count != 1) {
ret = -EINVAL;
break;
}
if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int fd = *(int *)data;
struct eventfd_ctx *evt;
if (fd <= 0)
break;
mtty_disable_msi(mdev_state);
if (mdev_state->msi_evtfd)
if (fd < 0)
break;
evt = eventfd_ctx_fdget(fd);
@ -993,20 +1681,37 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
break;
}
mdev_state->msi_evtfd = evt;
mdev_state->irq_fd = fd;
mdev_state->irq_index = index;
break;
}
if (!is_msi(mdev_state)) {
ret = -EINVAL;
break;
}
if (flags & VFIO_IRQ_SET_DATA_NONE) {
mtty_trigger_interrupt(mdev_state);
} else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
uint8_t trigger = *(uint8_t *)data;
if (trigger)
mtty_trigger_interrupt(mdev_state);
}
break;
}
break;
}
break;
case VFIO_PCI_MSIX_IRQ_INDEX:
pr_info("%s: MSIX_IRQ\n", __func__);
dev_dbg(mdev_state->vdev.dev, "%s: MSIX_IRQ\n", __func__);
ret = -ENOTTY;
break;
case VFIO_PCI_ERR_IRQ_INDEX:
pr_info("%s: ERR_IRQ\n", __func__);
dev_dbg(mdev_state->vdev.dev, "%s: ERR_IRQ\n", __func__);
ret = -ENOTTY;
break;
case VFIO_PCI_REQ_IRQ_INDEX:
pr_info("%s: REQ_IRQ\n", __func__);
dev_dbg(mdev_state->vdev.dev, "%s: REQ_IRQ\n", __func__);
ret = -ENOTTY;
break;
}
@ -1014,33 +1719,6 @@ static int mtty_set_irqs(struct mdev_state *mdev_state, uint32_t flags,
return ret;
}
static int mtty_trigger_interrupt(struct mdev_state *mdev_state)
{
int ret = -1;
if ((mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX) &&
(!mdev_state->msi_evtfd))
return -EINVAL;
else if ((mdev_state->irq_index == VFIO_PCI_INTX_IRQ_INDEX) &&
(!mdev_state->intx_evtfd)) {
pr_info("%s: Intr eventfd not found\n", __func__);
return -EINVAL;
}
if (mdev_state->irq_index == VFIO_PCI_MSI_IRQ_INDEX)
ret = eventfd_signal(mdev_state->msi_evtfd, 1);
else
ret = eventfd_signal(mdev_state->intx_evtfd, 1);
#if defined(DEBUG_INTR)
pr_info("Intx triggered\n");
#endif
if (ret != 1)
pr_err("%s: eventfd signal failed (%d)\n", __func__, ret);
return ret;
}
static int mtty_get_region_info(struct mdev_state *mdev_state,
struct vfio_region_info *region_info,
u16 *cap_type_id, void **cap_type)
@ -1084,22 +1762,16 @@ static int mtty_get_region_info(struct mdev_state *mdev_state,
static int mtty_get_irq_info(struct vfio_irq_info *irq_info)
{
switch (irq_info->index) {
case VFIO_PCI_INTX_IRQ_INDEX:
case VFIO_PCI_MSI_IRQ_INDEX:
case VFIO_PCI_REQ_IRQ_INDEX:
break;
default:
if (irq_info->index != VFIO_PCI_INTX_IRQ_INDEX &&
irq_info->index != VFIO_PCI_MSI_IRQ_INDEX)
return -EINVAL;
}
irq_info->flags = VFIO_IRQ_INFO_EVENTFD;
irq_info->count = 1;
if (irq_info->index == VFIO_PCI_INTX_IRQ_INDEX)
irq_info->flags |= (VFIO_IRQ_INFO_MASKABLE |
VFIO_IRQ_INFO_AUTOMASKED);
irq_info->flags |= VFIO_IRQ_INFO_MASKABLE |
VFIO_IRQ_INFO_AUTOMASKED;
else
irq_info->flags |= VFIO_IRQ_INFO_NORESIZE;
@ -1262,6 +1934,16 @@ static unsigned int mtty_get_available(struct mdev_type *mtype)
return atomic_read(&mdev_avail_ports) / type->nr_ports;
}
/*
 * close_device callback: disable any migration files and tear down both
 * the INTx and MSI eventfds.
 */
static void mtty_close(struct vfio_device *vdev)
{
	struct mdev_state *mdev_state;

	mdev_state = container_of(vdev, struct mdev_state, vdev);

	mtty_disable_files(mdev_state);
	mtty_disable_intx(mdev_state);
	mtty_disable_msi(mdev_state);
}
static const struct vfio_device_ops mtty_dev_ops = {
.name = "vfio-mtty",
.init = mtty_init_dev,
@ -1273,6 +1955,7 @@ static const struct vfio_device_ops mtty_dev_ops = {
.unbind_iommufd = vfio_iommufd_emulated_unbind,
.attach_ioas = vfio_iommufd_emulated_attach_ioas,
.detach_ioas = vfio_iommufd_emulated_detach_ioas,
.close_device = mtty_close,
};
static struct mdev_driver mtty_driver = {