mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-17 18:36:00 +00:00
The DSA (memory copy/zero/etc) and IAA (compression) accelerators in the
Sapphire Rapids and Emerald Rapids SOCs turn out to have a bug that has security implications. Both of these accelerators work by the application submitting a 64-byte command to the device; this command contains an opcode, the virtual address of the return value that the device will update on completion, and a set of opcode-specific values. In a typical scenario, a ring 3 application mmaps the device file and uses the ENQCMD or MOVDIR64B instructions (which are variations of a 64-byte atomic write) on this mmap'd memory region to submit commands directly to the device hardware. The return value, as specified in the command, is supposed to be 32 (or 64) byte aligned in memory, and generally the hardware checks and enforces this alignment. However, testing has shown that there are conditions (controlled by the submitter) where this enforcement does not happen, which makes it possible for the return value to span a page boundary. And this is where it goes wrong: the accelerators perform the virtual-to-physical address lookup on the first of the two pages, but end up continuing the write into the next consecutive physical (host) page rather than the next consecutive virtual page. In addition, the device ends up in a hung state after such an unaligned write of the return value. This patch series contains the proposed software-side solution, consisting of 3 parts. Part 1: Don't allow these two PCI devices to be assigned to VM guests (we cannot trust a VM guest to behave correctly and not cause this condition). Part 2: Don't allow ring 3 applications to set up the mmap unless they have CAP_SYS_RAWIO permissions. This makes it no longer possible for non-root applications to directly submit commands to the accelerator. Part 3: Add a write() method to the device so that an application can submit its commands to the kernel driver, which performs the needed sanity checks before submitting them to the hardware.
This switch from mmap to write is an incompatible interface change to non-root userspace, but we have not found a way to avoid this. All software we know of uses a small set of accessor libraries for these accelerators, for which libqpl and libdml (on github) are the most common. As part of the security release, updated versions of these libraries will be released that transparently fall back to write(). Intel has assigned CVE-2024-21823 to this hardware issue. -----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQR7kcW/9hKL3nKiNhEbbuU1gZlMiwUCZkJGUwAKCRAbbuU1gZlM i9kzAQCzkirMANgOz8YEV3QqxPoE6pVWRL/mrsJw2yVzv3RcVwEA/suqCkBXO5zx H9QzRXV0SGr5DJkra4ktSAwKUIfGYAQ= =8m4L -----END PGP SIGNATURE----- Merge tag 'idxd-for-linus-may2024' of git bundle from Arjan Pull DSA and IAA accelerator mis-alignment fix from Arjan van de Ven: "The DSA (memory copy/zero/etc) and IAA (compression) accelerators in the Sapphire Rapids and Emerald Rapids SOCs turn out to have a bug that has security implications. Both of these accelerators work by the application submitting a 64 byte command to the device; this command contains an opcode as well as the virtual address of the return value that the device will update on completion... and a set of opcode specific values. In a typical scenario a ring 3 application mmaps the device file and uses the ENQCMD or MOVDIR64 instructions (which are variations of a 64 byte atomic write) on this mmap'd memory region to directly submit commands to a device hardware. The return value as specified in the command, is supposed to be 32 (or 64) bytes aligned in memory, and generally the hardware checks and enforces this alignment. However in testing it has been found that there are conditions (controlled by the submitter) where this enforcement does not happen... which makes it possible for the return value to span a page boundary. 
And this is where it goes wrong: the accelerators perform the virtual-to-physical address lookup on the first of the two pages, but end up continuing the write into the next consecutive physical (host) page rather than the next consecutive virtual page. In addition, the device ends up in a hung state after such an unaligned write of the return value. This patch series contains the proposed software-side solution, consisting of three parts: - Don't allow these two PCI devices to be assigned to VM guests (we cannot trust a VM guest to behave correctly and not cause this condition) - Don't allow ring 3 applications to set up the mmap unless they have CAP_SYS_RAWIO permissions. This makes it no longer possible for non-root applications to directly submit commands to the accelerator - Add a write() method to the device so that an application can submit its commands to the kernel driver, which performs the needed sanity checks before submitting them to the hardware. This switch from mmap to write is an incompatible interface change for non-root userspace, but we have not found a way to avoid it. All software we know of uses a small set of accessor libraries for these accelerators, of which libqpl and libdml (on github) are the most common. As part of the security release, updated versions of these libraries will be released that transparently fall back to write(). Intel has assigned CVE-2024-21823 to this hardware issue" * tag 'idxd-for-linus-may2024' of git bundle from Arjan: dmaengine: idxd: add a write() method for applications to submit work dmaengine: idxd: add a new security check to deal with a hardware erratum VFIO: Add the SPR_DSA and SPR_IAX devices to the denylist
This commit is contained in:
commit
796aec4a5b
@ -400,6 +400,18 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma)
|
||||
int rc;
|
||||
|
||||
dev_dbg(&pdev->dev, "%s called\n", __func__);
|
||||
|
||||
/*
|
||||
* Due to an erratum in some of the devices supported by the driver,
|
||||
* direct user submission to the device can be unsafe.
|
||||
* (See the INTEL-SA-01084 security advisory)
|
||||
*
|
||||
* For the devices that exhibit this behavior, require that the user
|
||||
* has CAP_SYS_RAWIO capabilities.
|
||||
*/
|
||||
if (!idxd->user_submission_safe && !capable(CAP_SYS_RAWIO))
|
||||
return -EPERM;
|
||||
|
||||
rc = check_vma(wq, vma, __func__);
|
||||
if (rc < 0)
|
||||
return rc;
|
||||
@ -414,6 +426,70 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma)
|
||||
vma->vm_page_prot);
|
||||
}
|
||||
|
||||
static int idxd_submit_user_descriptor(struct idxd_user_context *ctx,
|
||||
struct dsa_hw_desc __user *udesc)
|
||||
{
|
||||
struct idxd_wq *wq = ctx->wq;
|
||||
struct idxd_dev *idxd_dev = &wq->idxd->idxd_dev;
|
||||
const uint64_t comp_addr_align = is_dsa_dev(idxd_dev) ? 0x20 : 0x40;
|
||||
void __iomem *portal = idxd_wq_portal_addr(wq);
|
||||
struct dsa_hw_desc descriptor __aligned(64);
|
||||
int rc;
|
||||
|
||||
rc = copy_from_user(&descriptor, udesc, sizeof(descriptor));
|
||||
if (rc)
|
||||
return -EFAULT;
|
||||
|
||||
/*
|
||||
* DSA devices are capable of indirect ("batch") command submission.
|
||||
* On devices where direct user submissions are not safe, we cannot
|
||||
* allow this since there is no good way for us to verify these
|
||||
* indirect commands.
|
||||
*/
|
||||
if (is_dsa_dev(idxd_dev) && descriptor.opcode == DSA_OPCODE_BATCH &&
|
||||
!wq->idxd->user_submission_safe)
|
||||
return -EINVAL;
|
||||
/*
|
||||
* As per the programming specification, the completion address must be
|
||||
* aligned to 32 or 64 bytes. If this is violated the hardware
|
||||
* engine can get very confused (security issue).
|
||||
*/
|
||||
if (!IS_ALIGNED(descriptor.completion_addr, comp_addr_align))
|
||||
return -EINVAL;
|
||||
|
||||
if (wq_dedicated(wq))
|
||||
iosubmit_cmds512(portal, &descriptor, 1);
|
||||
else {
|
||||
descriptor.priv = 0;
|
||||
descriptor.pasid = ctx->pasid;
|
||||
rc = idxd_enqcmds(wq, portal, &descriptor);
|
||||
if (rc < 0)
|
||||
return rc;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * write() handler: submit the descriptors found in a user buffer, one by one.
 *
 * @buf is treated as an array of struct dsa_hw_desc. Only whole descriptors
 * are consumed; trailing bytes in @len that do not make up a full descriptor
 * are ignored.
 *
 * Returns the number of bytes submitted. When a descriptor fails, the byte
 * count of the descriptors already submitted is returned if it is non-zero;
 * otherwise the error code of the failing submission is returned.
 */
static ssize_t idxd_cdev_write(struct file *filp, const char __user *buf, size_t len,
			       loff_t *unused)
{
	struct dsa_hw_desc __user *udesc = (struct dsa_hw_desc __user *)buf;
	struct idxd_user_context *ctx = filp->private_data;
	/* Computed once; size_t keeps the index and the bound the same type. */
	const size_t nr_descs = len / sizeof(struct dsa_hw_desc);
	ssize_t written = 0;
	size_t i;

	for (i = 0; i < nr_descs; i++) {
		int rc = idxd_submit_user_descriptor(ctx, udesc + i);

		/* Report partial progress in preference to the error. */
		if (rc)
			return written ? written : rc;

		written += sizeof(struct dsa_hw_desc);
	}

	return written;
}
|
||||
|
||||
static __poll_t idxd_cdev_poll(struct file *filp,
|
||||
struct poll_table_struct *wait)
|
||||
{
|
||||
@ -436,6 +512,7 @@ static const struct file_operations idxd_cdev_fops = {
|
||||
.open = idxd_cdev_open,
|
||||
.release = idxd_cdev_release,
|
||||
.mmap = idxd_cdev_mmap,
|
||||
.write = idxd_cdev_write,
|
||||
.poll = idxd_cdev_poll,
|
||||
};
|
||||
|
||||
|
@ -288,6 +288,7 @@ struct idxd_driver_data {
|
||||
int evl_cr_off;
|
||||
int cr_status_off;
|
||||
int cr_result_off;
|
||||
bool user_submission_safe;
|
||||
load_device_defaults_fn_t load_device_defaults;
|
||||
};
|
||||
|
||||
@ -374,6 +375,8 @@ struct idxd_device {
|
||||
|
||||
struct dentry *dbgfs_dir;
|
||||
struct dentry *dbgfs_evl_file;
|
||||
|
||||
bool user_submission_safe;
|
||||
};
|
||||
|
||||
static inline unsigned int evl_ent_size(struct idxd_device *idxd)
|
||||
|
@ -47,6 +47,7 @@ static struct idxd_driver_data idxd_driver_data[] = {
|
||||
.align = 32,
|
||||
.dev_type = &dsa_device_type,
|
||||
.evl_cr_off = offsetof(struct dsa_evl_entry, cr),
|
||||
.user_submission_safe = false, /* See INTEL-SA-01084 security advisory */
|
||||
.cr_status_off = offsetof(struct dsa_completion_record, status),
|
||||
.cr_result_off = offsetof(struct dsa_completion_record, result),
|
||||
},
|
||||
@ -57,6 +58,7 @@ static struct idxd_driver_data idxd_driver_data[] = {
|
||||
.align = 64,
|
||||
.dev_type = &iax_device_type,
|
||||
.evl_cr_off = offsetof(struct iax_evl_entry, cr),
|
||||
.user_submission_safe = false, /* See INTEL-SA-01084 security advisory */
|
||||
.cr_status_off = offsetof(struct iax_completion_record, status),
|
||||
.cr_result_off = offsetof(struct iax_completion_record, error_code),
|
||||
.load_device_defaults = idxd_load_iaa_device_defaults,
|
||||
@ -774,6 +776,8 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
|
||||
dev_info(&pdev->dev, "Intel(R) Accelerator Device (v%x)\n",
|
||||
idxd->hw.version);
|
||||
|
||||
idxd->user_submission_safe = data->user_submission_safe;
|
||||
|
||||
return 0;
|
||||
|
||||
err_dev_register:
|
||||
|
@ -6,9 +6,6 @@
|
||||
#include <uapi/linux/idxd.h>
|
||||
|
||||
/* PCI Config */
|
||||
#define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25
|
||||
#define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe
|
||||
|
||||
#define DEVICE_VERSION_1 0x100
|
||||
#define DEVICE_VERSION_2 0x200
|
||||
|
||||
|
@ -1197,12 +1197,35 @@ static ssize_t wq_enqcmds_retries_store(struct device *dev, struct device_attrib
|
||||
static struct device_attribute dev_attr_wq_enqcmds_retries =
|
||||
__ATTR(enqcmds_retries, 0644, wq_enqcmds_retries_show, wq_enqcmds_retries_store);
|
||||
|
||||
static ssize_t op_cap_show_common(struct device *dev, char *buf, unsigned long *opcap_bmap)
|
||||
{
|
||||
ssize_t pos;
|
||||
int i;
|
||||
|
||||
pos = 0;
|
||||
for (i = IDXD_MAX_OPCAP_BITS/64 - 1; i >= 0; i--) {
|
||||
unsigned long val = opcap_bmap[i];
|
||||
|
||||
/* On systems where direct user submissions are not safe, we need to clear out
|
||||
* the BATCH capability from the capability mask in sysfs since we cannot support
|
||||
* that command on such systems.
|
||||
*/
|
||||
if (i == DSA_OPCODE_BATCH/64 && !confdev_to_idxd(dev)->user_submission_safe)
|
||||
clear_bit(DSA_OPCODE_BATCH % 64, &val);
|
||||
|
||||
pos += sysfs_emit_at(buf, pos, "%*pb", 64, &val);
|
||||
pos += sysfs_emit_at(buf, pos, "%c", i == 0 ? '\n' : ',');
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static ssize_t wq_op_config_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct idxd_wq *wq = confdev_to_wq(dev);
|
||||
|
||||
return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, wq->opcap_bmap);
|
||||
return op_cap_show_common(dev, buf, wq->opcap_bmap);
|
||||
}
|
||||
|
||||
static int idxd_verify_supported_opcap(struct idxd_device *idxd, unsigned long *opmask)
|
||||
@ -1455,7 +1478,7 @@ static ssize_t op_cap_show(struct device *dev,
|
||||
{
|
||||
struct idxd_device *idxd = confdev_to_idxd(dev);
|
||||
|
||||
return sysfs_emit(buf, "%*pb\n", IDXD_MAX_OPCAP_BITS, idxd->opcap_bmap);
|
||||
return op_cap_show_common(dev, buf, idxd->opcap_bmap);
|
||||
}
|
||||
static DEVICE_ATTR_RO(op_cap);
|
||||
|
||||
|
@ -71,6 +71,8 @@ static bool vfio_pci_dev_in_denylist(struct pci_dev *pdev)
|
||||
case PCI_DEVICE_ID_INTEL_QAT_C62X_VF:
|
||||
case PCI_DEVICE_ID_INTEL_QAT_DH895XCC:
|
||||
case PCI_DEVICE_ID_INTEL_QAT_DH895XCC_VF:
|
||||
case PCI_DEVICE_ID_INTEL_DSA_SPR0:
|
||||
case PCI_DEVICE_ID_INTEL_IAX_SPR0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
@ -2687,8 +2687,10 @@
|
||||
#define PCI_DEVICE_ID_INTEL_I960 0x0960
|
||||
#define PCI_DEVICE_ID_INTEL_I960RM 0x0962
|
||||
#define PCI_DEVICE_ID_INTEL_HDA_HSW_0 0x0a0c
|
||||
#define PCI_DEVICE_ID_INTEL_DSA_SPR0 0x0b25
|
||||
#define PCI_DEVICE_ID_INTEL_HDA_HSW_2 0x0c0c
|
||||
#define PCI_DEVICE_ID_INTEL_CENTERTON_ILB 0x0c60
|
||||
#define PCI_DEVICE_ID_INTEL_IAX_SPR0 0x0cfe
|
||||
#define PCI_DEVICE_ID_INTEL_HDA_HSW_3 0x0d0c
|
||||
#define PCI_DEVICE_ID_INTEL_HDA_BYT 0x0f04
|
||||
#define PCI_DEVICE_ID_INTEL_SST_BYT 0x0f28
|
||||
|
Loading…
x
Reference in New Issue
Block a user