habanalabs: refactor razwi event notification

This event notification was compatible only with gaudi, where razwi
and page fault happen together.

To make it compatible with all ASICs, this refactor contains:

1. Razwi notification will only notify about razwi info.
   A new notification will be added in a future patch, to retrieve data
   about page fault errors.

2. Changed razwi info structure to support all ASICs.

Signed-off-by: Dani Liberman <dliberman@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Dani Liberman 2022-09-19 18:51:59 +03:00 committed by Oded Gabbay
parent ea73ef14dd
commit 52d5e54695
6 changed files with 88 additions and 84 deletions

View File

@ -2253,3 +2253,25 @@ inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
{
writel(val, hdev->rmmio + reg);
}
/**
 * hl_capture_razwi - record the parameters of the first razwi since device open.
 * @hdev: pointer to the habanalabs device structure.
 * @addr: address whose access caused the razwi.
 * @engine_id: array holding the engine ids of the possible razwi initiators.
 * @num_of_engines: number of valid entries in @engine_id, may be zero.
 * @flags: bitmask of HL_RAZWI_* flags describing the razwi.
 *
 * Only the first call after razwi_info_recorded was cleared stores anything;
 * every later call returns without touching the captured data, so the first
 * (root cause) razwi is preserved. A call with more than
 * HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR engines is rejected with an error print
 * and does not mark the razwi info as recorded.
 */
void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
			u8 flags)
{
	struct hl_info_razwi_event *razwi = &hdev->captured_err_info.razwi;
	u16 i;

	if (num_of_engines > HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR) {
		dev_err(hdev->dev,
			"Number of possible razwi initiators (%u) exceeded limit (%u)\n",
			num_of_engines, HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR);
		return;
	}

	/* In case it's the first razwi since the device was opened, capture its parameters */
	if (atomic_cmpxchg(&hdev->captured_err_info.razwi_info_recorded, 0, 1))
		return;

	razwi->timestamp = ktime_to_ns(ktime_get());
	razwi->addr = addr;
	razwi->num_of_possible_engines = num_of_engines;
	memcpy(&razwi->engine_id[0], &engine_id[0], num_of_engines * sizeof(u16));

	/*
	 * The whole structure is handed to user space as-is, and only the recorded
	 * flag is reset on device open. Invalidate the unused slots so engine ids
	 * left over from an earlier capture are never reported.
	 */
	for (i = num_of_engines; i < HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR; i++)
		razwi->engine_id[i] = HL_RAZWI_NA_ENG_ID;

	razwi->flags = flags;
}

View File

@ -2925,30 +2925,6 @@ struct cs_timeout_info {
u64 seq;
};
/**
 * struct razwi_info - info about the last razwi error that occurred.
 * @timestamp: razwi timestamp.
 * @write_enable: if set, writing to the razwi parameters in the structure is enabled.
 *                Otherwise it is disabled, so the first (root cause) razwi will not be
 *                overwritten.
 * @addr: address that caused the razwi.
 * @engine_id_1: engine id of the razwi initiator. If it was initiated by an engine that does
 *               not have an engine id, it will be set to U16_MAX.
 * @engine_id_2: second engine id of the razwi initiator. A razwi might have 2 possible
 *               engines, one of which caused it. In that case, this field will contain the
 *               second possible engine id, otherwise it will be set to U16_MAX.
 * @non_engine_initiator: set in case the initiator of the razwi does not have an engine id.
 * @type: cause of the razwi: page fault or access error, otherwise it will be set to U8_MAX.
 */
struct razwi_info {
ktime_t timestamp;
atomic_t write_enable;
u64 addr;
u16 engine_id_1;
u16 engine_id_2;
u8 non_engine_initiator;
u8 type;
};
#define MAX_QMAN_STREAMS_INFO 4
#define OPCODE_INFO_MAX_ADDR_SIZE 8
/**
@ -2985,11 +2961,14 @@ struct undefined_opcode_info {
* struct hl_error_info - holds information collected during an error.
* @cs_timeout: CS timeout error information.
* @razwi: razwi information.
* @razwi_info_recorded: if set, razwi information was already recorded and further writes
* are disabled, so the first (root cause) razwi will not be overwritten.
* @undef_opcode: undefined opcode information
*/
struct hl_error_info {
struct cs_timeout_info cs_timeout;
struct razwi_info razwi;
struct hl_info_razwi_event razwi;
atomic_t razwi_info_recorded;
struct undefined_opcode_info undef_opcode;
};
@ -3800,6 +3779,8 @@ hl_mmap_mem_buf_alloc(struct hl_mem_mgr *mmg,
struct hl_mmap_mem_buf_behavior *behavior, gfp_t gfp,
void *args);
__printf(2, 3) void hl_engine_data_sprintf(struct engines_data *e, const char *fmt, ...);
void hl_capture_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_of_engines,
u8 flags);
#ifdef CONFIG_DEBUG_FS

View File

@ -212,7 +212,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
hl_debugfs_add_file(hpriv);
atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1);
atomic_set(&hdev->captured_err_info.razwi.write_enable, 1);
atomic_set(&hdev->captured_err_info.razwi_info_recorded, 0);
hdev->captured_err_info.undef_opcode.write_enable = true;
hdev->open_counter++;

View File

@ -603,20 +603,14 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
struct hl_device *hdev = hpriv->hdev;
u32 max_size = args->return_size;
struct hl_info_razwi_event info = {0};
struct hl_info_razwi_event *info = &hdev->captured_err_info.razwi;
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
if ((!max_size) || (!out))
return -EINVAL;
info.timestamp = ktime_to_ns(hdev->captured_err_info.razwi.timestamp);
info.addr = hdev->captured_err_info.razwi.addr;
info.engine_id_1 = hdev->captured_err_info.razwi.engine_id_1;
info.engine_id_2 = hdev->captured_err_info.razwi.engine_id_2;
info.no_engine_id = hdev->captured_err_info.razwi.non_engine_initiator;
info.error_type = hdev->captured_err_info.razwi.type;
return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
return copy_to_user(out, info, min_t(size_t, max_size, sizeof(struct hl_info_razwi_event)))
? -EFAULT : 0;
}
static int undefined_opcode_info(struct hl_fpriv *hpriv, struct hl_info_args *args)

View File

@ -6505,8 +6505,8 @@ static void gaudi_get_event_desc(u16 event_type, char *desc, size_t size)
}
static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y,
bool is_write, s32 *engine_id_1,
s32 *engine_id_2)
bool is_write, u16 *engine_id_1,
u16 *engine_id_2)
{
u32 dma_id[2], dma_offset, err_cause[2], mask, i;
@ -6603,7 +6603,7 @@ static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u3
}
static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write,
u32 *engine_id_1, u32 *engine_id_2)
u16 *engine_id_1, u16 *engine_id_2)
{
u32 val, x_y, axi_id;
@ -6719,8 +6719,8 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool i
return "unknown initiator";
}
static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1,
u32 *engine_id_2)
static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u16 *engine_id_1,
u16 *engine_id_2, bool *is_read, bool *is_write)
{
if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) {
@ -6728,6 +6728,7 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_i
"RAZWI event caused by illegal write of %s\n",
gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2));
WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0);
*is_write = true;
}
if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) {
@ -6735,10 +6736,11 @@ static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_i
"RAZWI event caused by illegal read of %s\n",
gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2));
WREG32(mmMMU_UP_RAZWI_READ_VLD, 0);
*is_read = true;
}
}
static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type)
static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr)
{
struct gaudi_device *gaudi = hdev->asic_specific;
u32 val;
@ -6753,8 +6755,6 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr
*addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr);
*type = HL_RAZWI_PAGE_FAULT;
WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0);
}
@ -6765,7 +6765,6 @@ static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr
*addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA);
dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr);
*type = HL_RAZWI_MMU_ACCESS_ERROR;
WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0);
}
@ -7302,46 +7301,41 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e
static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
bool razwi)
{
u32 engine_id_1, engine_id_2;
bool is_read = false, is_write = false;
u16 engine_id[2], num_of_razwi_eng = 0;
char desc[64] = "";
u64 razwi_addr = 0;
u8 razwi_type;
int rc;
u8 razwi_flags = 0;
/*
* Init engine id by default as not valid and only if razwi initiated from engine with
* engine id it will get valid value.
* Init razwi type to default, will be changed only if razwi caused by page fault of
* MMU access error
*/
engine_id_1 = U16_MAX;
engine_id_2 = U16_MAX;
razwi_type = U8_MAX;
engine_id[0] = HL_RAZWI_NA_ENG_ID;
engine_id[1] = HL_RAZWI_NA_ENG_ID;
gaudi_get_event_desc(event_type, desc, sizeof(desc));
dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
event_type, desc);
if (razwi) {
gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2);
gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type);
gaudi_print_and_get_razwi_info(hdev, &engine_id[0], &engine_id[1], &is_read,
&is_write);
gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr);
/* In case it's the first razwi, save its parameters*/
rc = atomic_cmpxchg(&hdev->captured_err_info.razwi.write_enable, 1, 0);
if (rc) {
hdev->captured_err_info.razwi.timestamp = ktime_get();
hdev->captured_err_info.razwi.addr = razwi_addr;
hdev->captured_err_info.razwi.engine_id_1 = engine_id_1;
hdev->captured_err_info.razwi.engine_id_2 = engine_id_2;
/*
* If first engine id holds non valid value the razwi initiator
* does not have engine id
*/
hdev->captured_err_info.razwi.non_engine_initiator =
(engine_id_1 == U16_MAX);
hdev->captured_err_info.razwi.type = razwi_type;
if (is_read)
razwi_flags |= HL_RAZWI_READ;
if (is_write)
razwi_flags |= HL_RAZWI_WRITE;
if (engine_id[0] != HL_RAZWI_NA_ENG_ID) {
if (engine_id[1] != HL_RAZWI_NA_ENG_ID)
num_of_razwi_eng = 2;
else
num_of_razwi_eng = 1;
}
hl_capture_razwi(hdev, razwi_addr, engine_id, num_of_razwi_eng, razwi_flags);
}
}

View File

@ -1071,31 +1071,44 @@ struct hl_info_cs_timeout_event {
__u64 seq;
};
#define HL_RAZWI_PAGE_FAULT 0
#define HL_RAZWI_MMU_ACCESS_ERROR 1
#define HL_RAZWI_NA_ENG_ID U16_MAX
#define HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR 128
#define HL_RAZWI_READ BIT(0)
#define HL_RAZWI_WRITE BIT(1)
#define HL_RAZWI_LBW BIT(2)
#define HL_RAZWI_HBW BIT(3)
#define HL_RAZWI_RR BIT(4)
#define HL_RAZWI_ADDR_DEC BIT(5)
/**
* struct hl_info_razwi_event - razwi information.
* @timestamp: timestamp of razwi.
* @addr: address which accessing it caused razwi.
* @engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does not
* have engine id it will be set to U16_MAX.
* @engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
* engines which one them caused the razwi. In that case, it will contain the
* second possible engine id, otherwise it will be set to U16_MAX.
* @no_engine_id: if razwi initiator does not have engine id, this field will be set to 1,
* otherwise 0.
* @error_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
* @pad: padding to 64 bit.
* @engine_id: engine id of the razwi initiator, if it was initiated by engine that does not
* have engine id it will be set to HL_RAZWI_NA_ENG_ID. If there are several possible
* engines which caused the razwi, it will hold all of them.
* @num_of_possible_engines: contains number of possible engine ids. In some asics, razwi indication
* might be common for several engines and there is no way to get the
* exact engine. In this case, engine_id array will be filled with all
* possible engines that may have caused this razwi. Also, there might be
* a possibility in gaudi, where we don't have an indication of a specific
* engine; in that case the value of this parameter will be zero.
* @flags: bitmask for additional data: HL_RAZWI_READ - razwi caused by read operation
* HL_RAZWI_WRITE - razwi caused by write operation
* HL_RAZWI_LBW - razwi caused by lbw fabric transaction
* HL_RAZWI_HBW - razwi caused by hbw fabric transaction
* HL_RAZWI_RR - razwi caused by range register
* HL_RAZWI_ADDR_DEC - razwi caused by address decode error
* Note: this data is not supported by all asics, in that case the relevant bits will not
* be set.
*/
struct hl_info_razwi_event {
__s64 timestamp;
__u64 addr;
__u16 engine_id_1;
__u16 engine_id_2;
__u8 no_engine_id;
__u8 error_type;
__u8 pad[2];
__u16 engine_id[HL_RAZWI_MAX_NUM_OF_ENGINES_PER_RTR];
__u16 num_of_possible_engines;
__u8 flags;
__u8 pad[5];
};
#define MAX_QMAN_STREAMS_INFO 4