mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
synced 2025-01-17 18:36:00 +00:00
RDMA/hns: Recover 1bit-ECC error of RAM on chip
Since ECC memory keeps a memory system immune to single-bit errors, add support for correcting 1bit-ECC errors, which prevents a 1bit-ECC error from becoming an uncorrected type error. When a 1bit-ECC error caused by a read happens in the internal RAM of the ROCE engine, such as the QPC table, the ROCE engine only corrects that 1bit-ECC error when the entry is written. Link: https://lore.kernel.org/r/20220714134353.16700-6-liangwenpeng@huawei.com Signed-off-by: Haoyue Xu <xuhaoyue1@hisilicon.com> Signed-off-by: Wenpeng Liang <liangwenpeng@huawei.com> Signed-off-by: Leon Romanovsky <leon@kernel.org>
This commit is contained in:
parent
75e4e716f7
commit
2de949abd6
@ -959,6 +959,7 @@ struct hns_roce_dev {
|
||||
const struct hns_roce_hw *hw;
|
||||
void *priv;
|
||||
struct workqueue_struct *irq_workq;
|
||||
struct work_struct ecc_work;
|
||||
const struct hns_roce_dfx_hw *dfx;
|
||||
u32 func_num;
|
||||
u32 is_vf;
|
||||
|
@ -55,6 +55,42 @@ enum {
|
||||
CMD_RST_PRC_EBUSY,
|
||||
};
|
||||
|
||||
/*
 * Resource types that can be named in a RAM ECC report.
 *
 * The values are compared against the res_type field returned by
 * HNS_ROCE_QUERY_RAM_ECC and are used directly as indices into
 * fmea_ram_res[], so the order must not change and ECC_RESOURCE_COUNT
 * must stay last.
 */
enum ecc_resource_type {
	ECC_RESOURCE_QPC = 0,
	ECC_RESOURCE_CQC,
	ECC_RESOURCE_MPT,
	ECC_RESOURCE_SRQC,
	ECC_RESOURCE_GMV,
	ECC_RESOURCE_QPC_TIMER,
	ECC_RESOURCE_CQC_TIMER,
	ECC_RESOURCE_SCCC,
	ECC_RESOURCE_COUNT,	/* number of resource types, not a resource */
};
|
||||
|
||||
static const struct {
|
||||
const char *name;
|
||||
u8 read_bt0_op;
|
||||
u8 write_bt0_op;
|
||||
} fmea_ram_res[] = {
|
||||
{ "ECC_RESOURCE_QPC",
|
||||
HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 },
|
||||
{ "ECC_RESOURCE_CQC",
|
||||
HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 },
|
||||
{ "ECC_RESOURCE_MPT",
|
||||
HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 },
|
||||
{ "ECC_RESOURCE_SRQC",
|
||||
HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 },
|
||||
/* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */
|
||||
{ "ECC_RESOURCE_GMV",
|
||||
0, 0 },
|
||||
{ "ECC_RESOURCE_QPC_TIMER",
|
||||
HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 },
|
||||
{ "ECC_RESOURCE_CQC_TIMER",
|
||||
HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 },
|
||||
{ "ECC_RESOURCE_SCCC",
|
||||
HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 },
|
||||
};
|
||||
|
||||
static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
|
||||
struct ib_sge *sg)
|
||||
{
|
||||
@ -6017,6 +6053,142 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev,
|
||||
return IRQ_RETVAL(int_work);
|
||||
}
|
||||
|
||||
static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev,
|
||||
struct fmea_ram_ecc *ecc_info)
|
||||
{
|
||||
struct hns_roce_cmq_desc desc;
|
||||
struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data;
|
||||
int ret;
|
||||
|
||||
hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true);
|
||||
ret = hns_roce_cmq_send(hr_dev, &desc, 1);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR);
|
||||
ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE);
|
||||
ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Recover the GMV resource entry @idx through the command queue: fetch the
 * base address currently configured for the entry, then send the very same
 * address back with a second HNS_ROCE_OPC_CFG_GMV_BT command.
 *
 * Return: 0 on success, otherwise the error from hns_roce_cmq_send().
 */
static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx)
{
	struct hns_roce_cmq_desc desc;
	struct hns_roce_cmq_req *cmd = (struct hns_roce_cmq_req *)desc.data;
	u32 ba_lo;
	u32 ba_hi;
	int rc;

	/* Step 1: read back the entry selected by @idx. */
	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, true);
	hr_reg_write(cmd, CFG_GMV_BT_IDX, idx);

	rc = hns_roce_cmq_send(hr_dev, &desc, 1);
	if (rc) {
		dev_err(hr_dev->dev,
			"failed to execute cmd to read gmv, ret = %d.\n", rc);
		return rc;
	}

	/* Save the address before the descriptor is set up again. */
	ba_lo = hr_reg_read(cmd, CFG_GMV_BT_BA_L);
	ba_hi = hr_reg_read(cmd, CFG_GMV_BT_BA_H);

	/* Step 2: write the unchanged address back to the same entry. */
	hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false);
	hr_reg_write(cmd, CFG_GMV_BT_BA_L, ba_lo);
	hr_reg_write(cmd, CFG_GMV_BT_BA_H, ba_hi);
	hr_reg_write(cmd, CFG_GMV_BT_IDX, idx);

	rc = hns_roce_cmq_send(hr_dev, &desc, 1);

	return rc;
}
|
||||
|
||||
/*
 * Convert the first little-endian 64-bit word at @data into the address that
 * is passed to the write-back mailbox command.
 *
 * For the QPC timer, CQC timer and SCCC resources the word is used as is;
 * for every other resource type it is shifted left by PAGE_SHIFT.
 *
 * NOTE(review): the shift uses the CPU's PAGE_SHIFT - confirm that the stored
 * value really is in units of the kernel page size.
 */
static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data)
{
	u64 raw = le64_to_cpu(*data);

	switch (res_type) {
	case ECC_RESOURCE_QPC_TIMER:
	case ECC_RESOURCE_CQC_TIMER:
	case ECC_RESOURCE_SCCC:
		return raw;
	default:
		return raw << PAGE_SHIFT;
	}
}
|
||||
|
||||
/*
 * Recover a mailbox-handled resource: read entry @index of @res_type into a
 * command mailbox, derive the address from the returned data and issue the
 * matching write command with that address.
 *
 * @res_type must be a valid index into fmea_ram_res[]; the caller checks it.
 *
 * Return: 0 on success, the PTR_ERR() of a failed mailbox allocation, or the
 * error from hns_roce_cmd_mbox().
 */
static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type,
			       u32 index)
{
	struct hns_roce_cmd_mailbox *mbox;
	u8 rd_op;
	u8 wr_op;
	u64 ba;
	int rc;

	mbox = hns_roce_alloc_cmd_mailbox(hr_dev);
	if (IS_ERR(mbox))
		return PTR_ERR(mbox);

	rd_op = fmea_ram_res[res_type].read_bt0_op;
	wr_op = fmea_ram_res[res_type].write_bt0_op;

	rc = hns_roce_cmd_mbox(hr_dev, 0, mbox->dma, rd_op, index);
	if (rc) {
		dev_err(hr_dev->dev,
			"failed to execute cmd to read fmea ram, ret = %d.\n",
			rc);
	} else {
		ba = fmea_get_ram_res_addr(res_type, mbox->buf);

		rc = hns_roce_cmd_mbox(hr_dev, ba, 0, wr_op, index);
		if (rc)
			dev_err(hr_dev->dev,
				"failed to execute cmd to write fmea ram, ret = %d.\n",
				rc);
	}

	/* The mailbox is released on every path once it has been allocated. */
	hns_roce_free_cmd_mailbox(hr_dev, mbox);

	return rc;
}
|
||||
|
||||
static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev,
|
||||
struct fmea_ram_ecc *ecc_info)
|
||||
{
|
||||
u32 res_type = ecc_info->res_type;
|
||||
u32 index = ecc_info->index;
|
||||
int ret;
|
||||
|
||||
BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT);
|
||||
|
||||
if (res_type >= ECC_RESOURCE_COUNT) {
|
||||
dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n",
|
||||
res_type);
|
||||
return;
|
||||
}
|
||||
|
||||
if (res_type == ECC_RESOURCE_GMV)
|
||||
ret = fmea_recover_gmv(hr_dev, index);
|
||||
else
|
||||
ret = fmea_recover_others(hr_dev, res_type, index);
|
||||
if (ret)
|
||||
dev_err(hr_dev->dev,
|
||||
"failed to recover %s, index = %u, ret = %d.\n",
|
||||
fmea_ram_res[res_type].name, index, ret);
|
||||
}
|
||||
|
||||
static void fmea_ram_ecc_work(struct work_struct *ecc_work)
|
||||
{
|
||||
struct hns_roce_dev *hr_dev =
|
||||
container_of(ecc_work, struct hns_roce_dev, ecc_work);
|
||||
struct fmea_ram_ecc ecc_info = {};
|
||||
|
||||
if (fmea_ram_ecc_query(hr_dev, &ecc_info)) {
|
||||
dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
if (!ecc_info.is_ecc_err) {
|
||||
dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n");
|
||||
return;
|
||||
}
|
||||
|
||||
fmea_ram_ecc_recover(hr_dev, &ecc_info);
|
||||
}
|
||||
|
||||
static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
|
||||
{
|
||||
struct hns_roce_dev *hr_dev = dev_id;
|
||||
@ -6025,10 +6197,14 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id)
|
||||
|
||||
int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG);
|
||||
|
||||
if (int_st)
|
||||
if (int_st) {
|
||||
int_work = abnormal_interrupt_basic(hr_dev, int_st);
|
||||
else
|
||||
} else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) {
|
||||
queue_work(hr_dev->irq_workq, &hr_dev->ecc_work);
|
||||
int_work = IRQ_HANDLED;
|
||||
} else {
|
||||
dev_err(hr_dev->dev, "there is no abnormal irq found.\n");
|
||||
}
|
||||
|
||||
return IRQ_RETVAL(int_work);
|
||||
}
|
||||
@ -6344,6 +6520,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev)
|
||||
}
|
||||
}
|
||||
|
||||
INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work);
|
||||
|
||||
hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0);
|
||||
if (!hr_dev->irq_workq) {
|
||||
dev_err(dev, "failed to create irq workqueue.\n");
|
||||
|
@ -250,6 +250,7 @@ enum hns_roce_opcode_type {
|
||||
HNS_ROCE_OPC_CFG_GMV_TBL = 0x850f,
|
||||
HNS_ROCE_OPC_CFG_GMV_BT = 0x8510,
|
||||
HNS_ROCE_OPC_EXT_CFG = 0x8512,
|
||||
HNS_ROCE_QUERY_RAM_ECC = 0x8513,
|
||||
HNS_SWITCH_PARAMETER_CFG = 0x1033,
|
||||
};
|
||||
|
||||
@ -1107,6 +1108,11 @@ enum {
|
||||
#define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32)
|
||||
#define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64)
|
||||
|
||||
/* Fields of HNS_ROCE_QUERY_RAM_ECC */
|
||||
#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0)
|
||||
#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32)
|
||||
#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64)
|
||||
|
||||
struct hns_roce_cfg_sgid_tb {
|
||||
__le32 table_idx_rsv;
|
||||
__le32 vf_sgid_l;
|
||||
@ -1343,6 +1349,12 @@ struct hns_roce_dip {
|
||||
struct list_head node; /* all dips are on a list */
|
||||
};
|
||||
|
||||
struct fmea_ram_ecc {
|
||||
u32 is_ecc_err;
|
||||
u32 res_type;
|
||||
u32 index;
|
||||
};
|
||||
|
||||
/* only for RNR timeout issue of HIP08 */
|
||||
#define HNS_ROCE_CLOCK_ADJUST 1000
|
||||
#define HNS_ROCE_MAX_CQ_PERIOD 65
|
||||
|
Loading…
x
Reference in New Issue
Block a user