habanalabs: add support for notification via eventfd

The driver can now send notification events to a user process
through an eventfd that the process registers with the driver.
The driver uses this notification mechanism to inform the user
that an event has occurred.
A user thread can wait until a notification is received from
the driver.

The driver stores the events that occurred until the user reads them
using HL_INFO_GET_EVENTS, a new opcode in the INFO ioctl.

Gaudi specific implementation includes sending a notification
on a TPC assertion event that is received from f/w.

Signed-off-by: Tal Cohen <talcohen@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
Tal Cohen 2022-04-28 13:45:18 +03:00 committed by Greg Kroah-Hartman
parent f2daa2d97e
commit 422ef17103
6 changed files with 182 additions and 13 deletions

View File

@ -285,6 +285,14 @@ static void hpriv_release(struct kref *ref)
hdev->compute_ctx_in_release = 0;
/* release the eventfd */
if (hpriv->notifier_event.eventfd) {
eventfd_ctx_put(hpriv->notifier_event.eventfd);
hpriv->notifier_event.eventfd = 0;
}
mutex_destroy(&hpriv->notifier_event.lock);
kfree(hpriv);
}
@ -355,6 +363,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
list_del(&hpriv->dev_node);
mutex_unlock(&hdev->fpriv_ctrl_list_lock);
out:
/* release the eventfd */
if (hpriv->notifier_event.eventfd) {
eventfd_ctx_put(hpriv->notifier_event.eventfd);
hpriv->notifier_event.eventfd = 0;
}
mutex_destroy(&hpriv->notifier_event.lock);
put_pid(hpriv->taskpid);
kfree(hpriv);
@ -1506,6 +1521,43 @@ out_err:
return rc;
}
/*
 * hl_notifier_event_send - record an event and signal the registered eventfd
 *
 * @notifier_event: notifier state to update
 * @event: bit(s) of the occurred event, OR-ed into the pending events mask
 *
 * The event is accumulated in events_mask even when no eventfd is registered;
 * the eventfd is signaled only if one is currently set. Both steps are done
 * under the notifier lock.
 */
static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event)
{
	struct eventfd_ctx *ctx;

	mutex_lock(&notifier_event->lock);

	notifier_event->events_mask |= event;

	ctx = notifier_event->eventfd;
	if (ctx)
		eventfd_signal(ctx, 1);

	mutex_unlock(&notifier_event->lock);
}
/*
 * hl_notifier_event_send_all - notify all user processes via eventfd
 *
 * @hdev: pointer to habanalabs device structure
 * @event: the occurred event
 *
 * Sends the event to every file private entry on hdev->fpriv_list and then to
 * every entry on hdev->fpriv_ctrl_list. Each list is walked under its own
 * list lock. This function does not return a value.
 */
void hl_notifier_event_send_all(struct hl_device *hdev, u64 event)
{
	struct hl_fpriv *fpriv;

	/* entries on hdev->fpriv_list */
	mutex_lock(&hdev->fpriv_list_lock);
	list_for_each_entry(fpriv, &hdev->fpriv_list, dev_node) {
		hl_notifier_event_send(&fpriv->notifier_event, event);
	}
	mutex_unlock(&hdev->fpriv_list_lock);

	/* control device */
	mutex_lock(&hdev->fpriv_ctrl_list_lock);
	list_for_each_entry(fpriv, &hdev->fpriv_ctrl_list, dev_node) {
		hl_notifier_event_send(&fpriv->notifier_event, event);
	}
	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
}
/*
* hl_device_init - main initialization function for habanalabs device
*

View File

@ -21,6 +21,7 @@
#include <linux/hashtable.h>
#include <linux/debugfs.h>
#include <linux/rwsem.h>
#include <linux/eventfd.h>
#include <linux/bitfield.h>
#include <linux/genalloc.h>
#include <linux/sched/signal.h>
@ -1932,6 +1933,18 @@ struct hl_debug_params {
bool enable;
};
/**
 * struct hl_notifier_event - holds the notifier data structure
 * @eventfd: eventfd context that is signaled when an event is sent. It is 0 (no context)
 *           while no eventfd is registered.
 * @lock: mutex taken by the send, register, unregister and get-events flows around their
 *        accesses to @eventfd and @events_mask.
 * @events_mask: bitmask of the events that were sent and not yet read. It is reset to 0
 *               when it is read through HL_INFO_GET_EVENTS.
 */
struct hl_notifier_event {
struct eventfd_ctx *eventfd;
struct mutex lock;
u64 events_mask;
};
/*
* FILE PRIVATE STRUCTURE
*/
@ -1943,24 +1956,25 @@ struct hl_debug_params {
* @taskpid: current process ID.
* @ctx: current executing context. TODO: remove for multiple ctx per process
* @ctx_mgr: context manager to handle multiple context for this FD.
* @cb_mgr: command buffer manager to handle multiple buffers for this FD.
* @mem_mgr: manager descriptor for memory exportable via mmap
* @notifier_event: notifier eventfd towards user process
* @debugfs_list: list of relevant ASIC debugfs.
* @dev_node: node in the device list of file private data
* @refcount: number of related contexts.
* @restore_phase_mutex: lock for context switch and restore phase.
*/
struct hl_fpriv {
struct hl_device *hdev;
struct file *filp;
struct pid *taskpid;
struct hl_ctx *ctx;
struct hl_ctx_mgr ctx_mgr;
struct hl_mem_mgr mem_mgr;
struct list_head debugfs_list;
struct list_head dev_node;
struct kref refcount;
struct mutex restore_phase_mutex;
struct hl_device *hdev;
struct file *filp;
struct pid *taskpid;
struct hl_ctx *ctx;
struct hl_ctx_mgr ctx_mgr;
struct hl_mem_mgr mem_mgr;
struct hl_notifier_event notifier_event;
struct list_head debugfs_list;
struct list_head dev_node;
struct kref refcount;
struct mutex restore_phase_mutex;
};
@ -2676,8 +2690,8 @@ struct hl_reset_info {
* @state_dump_specs: constants and dictionaries needed to dump system state.
* @multi_cs_completion: array of multi-CS completion.
* @clk_throttling: holds information about current/previous clock throttling events
* @reset_info: holds current device reset information.
* @last_error: holds information about last session in which CS timeout or razwi error occurred.
* @reset_info: holds current device reset information.
* @stream_master_qid_arr: pointer to array with QIDs of master streams.
* @fw_major_version: major version of current loaded preboot
* @dram_used_mem: current DRAM memory consumption.
@ -3071,6 +3085,8 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization);
int hl_build_hwmon_channel_info(struct hl_device *hdev,
struct cpucp_sensor *sensors_arr);
void hl_notifier_event_send_all(struct hl_device *hdev, u64 event);
int hl_sysfs_init(struct hl_device *hdev);
void hl_sysfs_fini(struct hl_device *hdev);

View File

@ -134,6 +134,10 @@ int hl_device_open(struct inode *inode, struct file *filp)
hpriv->hdev = hdev;
filp->private_data = hpriv;
hpriv->filp = filp;
hpriv->notifier_event.events_mask = 0;
hpriv->notifier_event.eventfd = 0;
mutex_init(&hpriv->notifier_event.lock);
mutex_init(&hpriv->restore_phase_mutex);
kref_init(&hpriv->refcount);
nonseekable_open(inode, filp);
@ -208,6 +212,7 @@ out_err:
hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);
filp->private_data = NULL;
mutex_destroy(&hpriv->restore_phase_mutex);
mutex_destroy(&hpriv->notifier_event.lock);
put_pid(hpriv->taskpid);
kfree(hpriv);
@ -241,6 +246,10 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
hpriv->hdev = hdev;
filp->private_data = hpriv;
hpriv->filp = filp;
hpriv->notifier_event.events_mask = 0;
hpriv->notifier_event.eventfd = 0;
mutex_init(&hpriv->notifier_event.lock);
nonseekable_open(inode, filp);
hpriv->taskpid = get_task_pid(current, PIDTYPE_PID);

View File

@ -116,6 +116,25 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate,
return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0;
}
/*
 * events_info - HL_INFO_GET_EVENTS handler: return and clear the pending events
 *
 * @hpriv: file private data whose notifier events are read
 * @args: INFO ioctl arguments; return_pointer is the user buffer and
 *        return_size is its size in bytes
 *
 * Return: 0 on success, -EINVAL if the user buffer is missing or smaller than
 * a u64, -EFAULT if the copy to user space fails.
 *
 * NOTE: the mask is cleared under the lock before the copy, so the events
 * that were read are not restored if the copy to user space fails.
 */
static int events_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
	u32 max_size = args->return_size;
	u64 events_mask;
	void __user *out = (void __user *) (uintptr_t) args->return_pointer;

	if ((max_size < sizeof(u64)) || (!out))
		return -EINVAL;

	/* read-and-clear atomically with respect to concurrent event senders */
	mutex_lock(&hpriv->notifier_event.lock);
	events_mask = hpriv->notifier_event.events_mask;
	hpriv->notifier_event.events_mask = 0;
	mutex_unlock(&hpriv->notifier_event.lock);

	/*
	 * copy_to_user() returns the number of bytes that could NOT be copied,
	 * not an error code. Translate it to -EFAULT, as the other info
	 * handlers do, instead of leaking a positive value to the caller.
	 */
	return copy_to_user(out, &events_mask, sizeof(u64)) ? -EFAULT : 0;
}
static int dram_usage_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
@ -614,6 +633,43 @@ static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_
return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
}
static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
int rc;
/* check if there is already a registered on that process */
mutex_lock(&hpriv->notifier_event.lock);
if (hpriv->notifier_event.eventfd) {
mutex_unlock(&hpriv->notifier_event.lock);
return -EINVAL;
}
hpriv->notifier_event.eventfd = eventfd_ctx_fdget(args->eventfd);
if (IS_ERR(hpriv->notifier_event.eventfd)) {
rc = PTR_ERR(hpriv->notifier_event.eventfd);
hpriv->notifier_event.eventfd = 0;
mutex_unlock(&hpriv->notifier_event.lock);
return rc;
}
mutex_unlock(&hpriv->notifier_event.lock);
return 0;
}
static int eventfd_unregister(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
mutex_lock(&hpriv->notifier_event.lock);
if (!hpriv->notifier_event.eventfd) {
mutex_unlock(&hpriv->notifier_event.lock);
return -EINVAL;
}
eventfd_ctx_put(hpriv->notifier_event.eventfd);
hpriv->notifier_event.eventfd = 0;
mutex_unlock(&hpriv->notifier_event.lock);
return 0;
}
static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
struct device *dev)
{
@ -667,6 +723,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES:
return dev_mem_alloc_page_sizes_info(hpriv, args);
case HL_INFO_GET_EVENTS:
return events_info(hpriv, args);
default:
break;
}
@ -717,6 +776,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_DRAM_PENDING_ROWS:
return dram_pending_rows_info(hpriv, args);
case HL_INFO_REGISTER_EVENTFD:
return eventfd_register(hpriv, args);
case HL_INFO_UNREGISTER_EVENTFD:
return eventfd_unregister(hpriv, args);
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -EINVAL;

View File

@ -7879,7 +7879,6 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_MMU_PAGE_FAULT:
case GAUDI_EVENT_MMU_WR_PERM:
case GAUDI_EVENT_RAZWI_OR_ADC:
case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM:
case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM:
fallthrough;
@ -7899,6 +7898,19 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
hl_fw_unmask_irq(hdev, event_type);
break;
case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_handle_qman_err(hdev, event_type);
hl_fw_unmask_irq(hdev, event_type);
/* In TPC QM event, notify on TPC assertion. While there isn't
* a specific event for assertion yet, the FW generates QM event.
* The SW upper layer will inspect an internal mapped area to indicate
* if the event is a tpc assertion or tpc QM.
*/
hl_notifier_event_send_all(hdev, HL_NOTIFIER_EVENT_TPC_ASSERT);
break;
case GAUDI_EVENT_RAZWI_OR_ADC_SW:
gaudi_print_irq_info(hdev, event_type, true);
goto reset_device;

View File

@ -349,6 +349,9 @@ enum hl_server_type {
* Razwi initiator.
* Razwi cause, was it a page fault or MMU access error.
* HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES - Retrieve valid page sizes for device memory allocation
* HL_INFO_REGISTER_EVENTFD - Register eventfd for event notifications.
* HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd
* HL_INFO_GET_EVENTS - Retrieve the last occurred events
*/
#define HL_INFO_HW_IP_INFO 0
#define HL_INFO_HW_EVENTS 1
@ -374,6 +377,9 @@ enum hl_server_type {
#define HL_INFO_CS_TIMEOUT_EVENT 24
#define HL_INFO_RAZWI_EVENT 25
#define HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES 26
#define HL_INFO_REGISTER_EVENTFD 28
#define HL_INFO_UNREGISTER_EVENTFD 29
#define HL_INFO_GET_EVENTS 30
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
@ -679,6 +685,7 @@ enum gaudi_dcores {
* @period_ms: Period value, in milliseconds, for utilization rate in range 100ms - 1000ms in 100 ms
* resolution. Currently not in use.
* @pll_index: Index as defined in hl_<asic type>_pll_index enumeration.
* @eventfd: event file descriptor for event notifications.
* @pad: Padding to 64 bit.
*/
struct hl_info_args {
@ -691,6 +698,7 @@ struct hl_info_args {
__u32 ctx_id;
__u32 period_ms;
__u32 pll_index;
__u32 eventfd;
};
__u32 pad;
@ -1390,6 +1398,13 @@ struct hl_debug_args {
__u32 ctx_id;
};
/*
* Notifier event values - for the notification mechanism and the HL_INFO_GET_EVENTS command
*
* HL_NOTIFIER_EVENT_TPC_ASSERT - Indicates TPC assert event
*/
#define HL_NOTIFIER_EVENT_TPC_ASSERT (1 << 0)
/*
* Various information operations such as:
* - H/W IP information