Merge branch 'pci-device-recovery' into features

Niklas Schnelle says:

===================
This patch series enhances the introspectability of PCI device recovery
for firmware. Until now, when Linux performed recovery in response to a
firmware error report, the firmware debug data would have no indication
of whether the recovery was successful or whether it failed, for example
due to KVM pass-through.

Improve on this by reporting recovery status as well as some debug
information such as device driver name and s390dbf/pci_msg/sprintf logs
via the SCLP Write Event Data Action Qualifier 2 (Log Data provided)
mechanism.
===================

Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
This commit is contained in:
Alexander Gordeev 2024-12-18 16:06:24 +01:00
commit 3ace3c4214
9 changed files with 428 additions and 83 deletions

View File

@ -85,6 +85,10 @@ int debug_dflt_header_fn(debug_info_t *id, struct debug_view *view,
int area, debug_entry_t *entry,
char *out_buf, size_t out_buf_size);
#define DEBUG_SPRINTF_MAX_ARGS 10
int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
char *out_buf, size_t out_buf_size,
const char *inbuf);
struct debug_view {
char name[DEBUG_MAX_NAME_LEN];
debug_prolog_proc_t *prolog_proc;
@ -114,6 +118,9 @@ debug_info_t *debug_register_mode(const char *name, int pages, int nr_areas,
int buf_size, umode_t mode, uid_t uid,
gid_t gid);
ssize_t debug_dump(debug_info_t *id, struct debug_view *view,
char *buf, size_t buf_size, bool reverse);
void debug_unregister(debug_info_t *id);
void debug_set_level(debug_info_t *id, int new_level);

View File

@ -16,6 +16,11 @@
/* 24 + 16 * SCLP_MAX_CORES */
#define EXT_SCCB_READ_CPU (3 * PAGE_SIZE)
#define SCLP_ERRNOTIFY_AQ_RESET 0
#define SCLP_ERRNOTIFY_AQ_REPAIR 1
#define SCLP_ERRNOTIFY_AQ_INFO_LOG 2
#define SCLP_ERRNOTIFY_AQ_OPTICS_DATA 3
#ifndef __ASSEMBLY__
#include <linux/uio.h>
#include <asm/chpid.h>
@ -112,6 +117,34 @@ struct sclp_info {
};
extern struct sclp_info sclp;
/* Header placed at the start of an SCCB; embedded first in struct err_notify_sccb */
struct sccb_header {
u16 length;
u8 function_code;
u8 control_mask[3];
u16 response_code;
} __packed;
/* Header placed at the start of an event buffer; embedded first in struct err_notify_evbuf */
struct evbuf_header {
u16 length;
u8 type;
u8 flags;
u16 _reserved;
} __packed;
/*
 * Error notification event buffer: fixed fields followed by a variable
 * length payload in the flexible array member @data.
 * NOTE(review): action presumably carries one of the SCLP_ERRNOTIFY_AQ_*
 * values and fh/fid the PCI function handle and function ID - confirm
 * against sclp_pci_report().
 */
struct err_notify_evbuf {
struct evbuf_header header;
u8 action;
u8 atype;
u32 fh;
u32 fid;
u8 data[];
} __packed;
/* SCCB consisting of the SCCB header followed by a single error notification event buffer */
struct err_notify_sccb {
struct sccb_header header;
struct err_notify_evbuf evbuf;
} __packed;
struct zpci_report_error_header {
u8 version; /* Interface version byte */
u8 action; /* Action qualifier byte

View File

@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/math.h>
#include <linux/minmax.h>
#include <linux/debugfs.h>
@ -94,9 +95,6 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view,
static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
char *out_buf, size_t out_buf_size,
const char *in_buf);
static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
char *out_buf, size_t out_buf_size,
const char *inbuf);
static void debug_areas_swap(debug_info_t *a, debug_info_t *b);
static void debug_events_append(debug_info_t *dest, debug_info_t *src);
@ -354,7 +352,10 @@ static debug_info_t *debug_info_copy(debug_info_t *in, int mode)
for (i = 0; i < in->nr_areas; i++) {
for (j = 0; j < in->pages_per_area; j++)
memcpy(rc->areas[i][j], in->areas[i][j], PAGE_SIZE);
rc->active_pages[i] = in->active_pages[i];
rc->active_entries[i] = in->active_entries[i];
}
rc->active_area = in->active_area;
out:
spin_unlock_irqrestore(&in->lock, flags);
return rc;
@ -422,11 +423,17 @@ static int debug_format_entry(file_private_info_t *p_info)
return len;
}
/*
* debug_next_entry:
* - goto next entry in p_info
/**
* debug_next_entry - Go to the next entry
* @p_info: Private info that is manipulated
*
* Sets the current position in @p_info to the next entry. If no further entry
* exists the current position is set to one after the end and the return value
* indicates that no further entries exist.
*
* Return: True if there are more following entries, false otherwise
*/
static inline int debug_next_entry(file_private_info_t *p_info)
static inline bool debug_next_entry(file_private_info_t *p_info)
{
debug_info_t *id;
@ -434,10 +441,10 @@ static inline int debug_next_entry(file_private_info_t *p_info)
if (p_info->act_entry == DEBUG_PROLOG_ENTRY) {
p_info->act_entry = 0;
p_info->act_page = 0;
goto out;
return true;
}
if (!id->areas)
return 1;
return false;
p_info->act_entry += id->entry_size;
/* switch to next page, if we reached the end of the page */
if (p_info->act_entry > (PAGE_SIZE - id->entry_size)) {
@ -450,10 +457,87 @@ static inline int debug_next_entry(file_private_info_t *p_info)
p_info->act_page = 0;
}
if (p_info->act_area >= id->nr_areas)
return 1;
return false;
}
out:
return 0;
return true;
}
/**
 * debug_to_act_entry - Jump to the snapshot's active entry
 * @p_info: Private info whose position is updated
 *
 * Copies the active area of @p_info->debug_info_snap, together with the
 * active page and active entry recorded for that area, into the current
 * position (act_area, act_page, act_entry) of @p_info.
 */
static void debug_to_act_entry(file_private_info_t *p_info)
{
	debug_info_t *snap = p_info->debug_info_snap;
	int area = snap->active_area;

	p_info->act_area = area;
	p_info->act_page = snap->active_pages[area];
	p_info->act_entry = snap->active_entries[area];
}
/**
 * debug_prev_entry - Go to the previous entry
 * @p_info: Private info that is manipulated
 *
 * Moves the current position in @p_info one entry backwards. If the position
 * is still DEBUG_PROLOG_ENTRY it is first moved to the active entry of the
 * snapshot. Stepping back past the start of a page continues at the last
 * entry of the previous page, stepping back past the first page of an area
 * continues in the previous area, and stepping back past the first area
 * wraps around to the last area.
 *
 * Return: False if the snapshot has no areas or if the new position is the
 * snapshot's active entry again (full circle), true otherwise
 */
static inline bool debug_prev_entry(file_private_info_t *p_info)
{
	debug_info_t *snap = p_info->debug_info_snap;
	int active;

	if (p_info->act_entry == DEBUG_PROLOG_ENTRY)
		debug_to_act_entry(p_info);
	if (!snap->areas)
		return false;

	p_info->act_entry -= snap->entry_size;
	if (p_info->act_entry < 0) {
		/* fell off the front of the page: last entry of the previous page */
		p_info->act_entry = rounddown(PAGE_SIZE, snap->entry_size) - snap->entry_size;
		p_info->act_page--;
		if (p_info->act_page < 0) {
			/* fell off the front of the area: last page of the previous area */
			p_info->act_area--;
			p_info->act_page = snap->pages_per_area - 1;
		}
		if (p_info->act_area < 0)
			p_info->act_area = (snap->nr_areas - 1) % snap->nr_areas;
	}

	/* back at the active entry means every entry has been visited */
	active = snap->active_area;
	return !(active == p_info->act_area &&
		 snap->active_pages[active] == p_info->act_page &&
		 snap->active_entries[active] == p_info->act_entry);
}
/**
 * debug_move_entry - Step to the neighbouring entry in a chosen direction
 * @p_info: Private info that is manipulated
 * @reverse: Direction selector; true steps backwards, false steps forwards
 *
 * Delegates to debug_prev_entry() when @reverse is true and to
 * debug_next_entry() when it is false.
 *
 * Return: True if there are further entries in that direction,
 * false otherwise.
 */
static bool debug_move_entry(file_private_info_t *p_info, bool reverse)
{
	return reverse ? debug_prev_entry(p_info) : debug_next_entry(p_info);
}
/*
@ -495,7 +579,7 @@ static ssize_t debug_output(struct file *file, /* file descriptor */
}
if (copy_size == formatted_line_residue) {
entry_offset = 0;
if (debug_next_entry(p_info))
if (!debug_next_entry(p_info))
goto out;
}
}
@ -530,6 +614,42 @@ static ssize_t debug_input(struct file *file, const char __user *user_buf,
return rc; /* number of input characters */
}
/*
 * debug_file_private_alloc:
 * - take a snapshot of @debug_info and allocate the private reader state
 *   that iterates over it using @view
 * - the position starts at DEBUG_PROLOG_ENTRY with offset 0
 * - on success a reference on @debug_info is taken via debug_info_get()
 * - returns NULL if the snapshot or the private info cannot be allocated
 */
static file_private_info_t *debug_file_private_alloc(debug_info_t *debug_info,
						     struct debug_view *view)
{
	file_private_info_t *priv;
	debug_info_t *snap;
	int copy_mode;

	/*
	 * The snapshot keeps the data consistent while it is read. The
	 * areas themselves only have to be copied for views that format
	 * them, i.e. views with a format or header callback.
	 */
	if (view->format_proc || view->header_proc)
		copy_mode = ALL_AREAS;
	else
		copy_mode = NO_AREAS;

	snap = debug_info_copy(debug_info, copy_mode);
	if (!snap)
		return NULL;

	priv = kmalloc(sizeof(file_private_info_t), GFP_KERNEL);
	if (!priv) {
		debug_info_free(snap);
		return NULL;
	}

	priv->debug_info_org = debug_info;
	priv->debug_info_snap = snap;
	priv->view = view;
	priv->offset = 0;
	priv->act_area = 0;
	priv->act_page = 0;
	priv->act_entry = DEBUG_PROLOG_ENTRY;
	priv->act_entry_offset = 0;
	debug_info_get(debug_info);

	return priv;
}
/*
* debug_open:
* - called for user open()
@ -538,7 +658,7 @@ static ssize_t debug_input(struct file *file, const char __user *user_buf,
*/
static int debug_open(struct inode *inode, struct file *file)
{
debug_info_t *debug_info, *debug_info_snapshot;
debug_info_t *debug_info;
file_private_info_t *p_info;
int i, rc = 0;
@ -556,42 +676,26 @@ static int debug_open(struct inode *inode, struct file *file)
goto out;
found:
/* Make snapshot of current debug areas to get it consistent. */
/* To copy all the areas is only needed, if we have a view which */
/* formats the debug areas. */
if (!debug_info->views[i]->format_proc && !debug_info->views[i]->header_proc)
debug_info_snapshot = debug_info_copy(debug_info, NO_AREAS);
else
debug_info_snapshot = debug_info_copy(debug_info, ALL_AREAS);
if (!debug_info_snapshot) {
rc = -ENOMEM;
goto out;
}
p_info = kmalloc(sizeof(file_private_info_t), GFP_KERNEL);
p_info = debug_file_private_alloc(debug_info, debug_info->views[i]);
if (!p_info) {
debug_info_free(debug_info_snapshot);
rc = -ENOMEM;
goto out;
}
p_info->offset = 0;
p_info->debug_info_snap = debug_info_snapshot;
p_info->debug_info_org = debug_info;
p_info->view = debug_info->views[i];
p_info->act_area = 0;
p_info->act_page = 0;
p_info->act_entry = DEBUG_PROLOG_ENTRY;
p_info->act_entry_offset = 0;
file->private_data = p_info;
debug_info_get(debug_info);
nonseekable_open(inode, file);
out:
mutex_unlock(&debug_mutex);
return rc;
}
/*
 * debug_file_private_free:
 * - free the snapshot held by @p_info (if there is one), drop the reference
 *   on the original debug info and free @p_info itself
 */
static void debug_file_private_free(file_private_info_t *p_info)
{
	debug_info_t *snap = p_info->debug_info_snap;

	if (snap)
		debug_info_free(snap);
	debug_info_put(p_info->debug_info_org);
	kfree(p_info);
}
/*
* debug_close:
* - called for user close()
@ -602,13 +706,59 @@ static int debug_close(struct inode *inode, struct file *file)
file_private_info_t *p_info;
p_info = (file_private_info_t *) file->private_data;
if (p_info->debug_info_snap)
debug_info_free(p_info->debug_info_snap);
debug_info_put(p_info->debug_info_org);
kfree(file->private_data);
debug_file_private_free(p_info);
file->private_data = NULL;
return 0; /* success */
}
/**
 * debug_dump - Get a textual representation of debug info, or as much as fits
 * @id: Debug information to use
 * @view: View with which to dump the debug information
 * @buf: Buffer the textual debug data representation is written to
 * @buf_size: Size of the buffer, including the trailing '\0' byte
 * @reverse: Go backwards from the last written entry
 *
 * This function may be used whenever a textual representation of the debug
 * information is required without using an s390dbf file. The prolog entry is
 * formatted first, then entries are appended in the requested direction until
 * either no entries remain or the buffer is full; output that does not fit is
 * truncated.
 *
 * Note: It is the callers responsibility to supply a view that is compatible
 * with the debug information data.
 *
 * Return: On success returns the number of bytes written to the buffer not
 * including the trailing '\0' byte. If buf_size == 0 the function returns 0.
 * On failure an error code less than 0 is returned.
 */
ssize_t debug_dump(debug_info_t *id, struct debug_view *view,
		   char *buf, size_t buf_size, bool reverse)
{
	file_private_info_t *priv;
	size_t chunk, written = 0;

	/* Without room for the '\0' byte nothing can be written at all */
	if (buf_size < 1)
		return 0;
	/* Reserve the last byte for the terminator */
	buf_size--;

	priv = debug_file_private_alloc(id, view);
	if (!priv)
		return -ENOMEM;

	/* The first pass always has the DEBUG_PROLOG_ENTRY to format */
	for (;;) {
		chunk = debug_format_entry(priv);
		chunk = min(chunk, buf_size - written);
		memcpy(buf + written, priv->temp_buf, chunk);
		written += chunk;
		if (written >= buf_size)
			break;
		if (!debug_move_entry(priv, reverse))
			break;
	}
	debug_file_private_free(priv);

	buf[written] = '\0';
	return written;
}
/* Create debugfs entries and add to internal list. */
static void _debug_register(debug_info_t *id)
{
@ -1532,7 +1682,7 @@ EXPORT_SYMBOL(debug_dflt_header_fn);
#define DEBUG_SPRINTF_MAX_ARGS 10
static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
char *out_buf, size_t out_buf_size, const char *inbuf)
{
debug_sprintf_entry_t *curr_event = (debug_sprintf_entry_t *)inbuf;
@ -1570,6 +1720,7 @@ static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
out:
return rc;
}
EXPORT_SYMBOL(debug_sprintf_format_fn);
/*
* debug_init:

View File

@ -5,6 +5,6 @@
obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_clp.o \
pci_event.o pci_debug.o pci_insn.o pci_mmio.o \
pci_bus.o pci_kvm_hook.o
pci_bus.o pci_kvm_hook.o pci_report.o
obj-$(CONFIG_PCI_IOV) += pci_iov.o
obj-$(CONFIG_SYSFS) += pci_sysfs.o

View File

@ -16,6 +16,7 @@
#include <asm/sclp.h>
#include "pci_bus.h"
#include "pci_report.h"
/* Content Code Description for PCI Function Error */
struct zpci_ccdf_err {
@ -169,6 +170,8 @@ static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev,
static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
{
pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT;
struct zpci_dev *zdev = to_zpci(pdev);
char *status_str = "success";
struct pci_driver *driver;
/*
@ -186,30 +189,38 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
if (is_passed_through(pdev)) {
pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n",
pci_name(pdev));
status_str = "failed (pass-through)";
goto out_unlock;
}
driver = to_pci_driver(pdev->dev.driver);
if (!is_driver_supported(driver)) {
if (!driver)
if (!driver) {
pr_info("%s: Cannot be recovered because no driver is bound to the device\n",
pci_name(pdev));
else
status_str = "failed (no driver)";
} else {
pr_info("%s: The %s driver bound to the device does not support error recovery\n",
pci_name(pdev),
driver->name);
status_str = "failed (no driver support)";
}
goto out_unlock;
}
ers_res = zpci_event_notify_error_detected(pdev, driver);
if (ers_result_indicates_abort(ers_res))
if (ers_result_indicates_abort(ers_res)) {
status_str = "failed (abort on detection)";
goto out_unlock;
}
if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) {
ers_res = zpci_event_do_error_state_clear(pdev, driver);
if (ers_result_indicates_abort(ers_res))
if (ers_result_indicates_abort(ers_res)) {
status_str = "failed (abort on MMIO enable)";
goto out_unlock;
}
}
if (ers_res == PCI_ERS_RESULT_NEED_RESET)
ers_res = zpci_event_do_reset(pdev, driver);
@ -217,6 +228,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
if (ers_res != PCI_ERS_RESULT_RECOVERED) {
pr_err("%s: Automatic recovery failed; operator intervention is required\n",
pci_name(pdev));
status_str = "failed (driver can't recover)";
goto out_unlock;
}
@ -225,6 +237,7 @@ static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev)
driver->err_handler->resume(pdev);
out_unlock:
pci_dev_unlock(pdev);
zpci_report_status(zdev, "recovery", status_str);
return ers_res;
}

158
arch/s390/pci/pci_report.c Normal file
View File

@ -0,0 +1,158 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corp. 2024
*
* Author(s):
* Niklas Schnelle <schnelle@linux.ibm.com>
*
*/
#define KMSG_COMPONENT "zpci"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
#include <linux/kernel.h>
#include <linux/sprintf.h>
#include <linux/pci.h>
#include <asm/sclp.h>
#include <asm/debug.h>
#include <asm/pci_debug.h>
#include "pci_report.h"
/* Value stored in zpci_report_error_data.err_log_id by zpci_report_status() */
#define ZPCI_ERR_LOG_ID_KERNEL_REPORT 0x4714
/*
 * Payload of a status report: two fixed 64 bit fields followed by the
 * variable length textual log in the flexible array member log_data.
 * zpci_report_status() sets timestamp from ktime_get_clocktai_seconds()
 * and err_log_id to ZPCI_ERR_LOG_ID_KERNEL_REPORT.
 */
struct zpci_report_error_data {
u64 timestamp;
u64 err_log_id;
char log_data[];
} __packed;
/*
 * ZPCI_REPORT_SIZE is one page minus the size of struct err_notify_sccb.
 * NOTE(review): presumably so the report still fits once it is placed
 * behind that SCCB in a single page - confirm against sclp_pci_report().
 * ZPCI_REPORT_DATA_SIZE additionally subtracts the fixed fields of
 * struct zpci_report_error_data and is used as the capacity of log_data.
 */
#define ZPCI_REPORT_SIZE (PAGE_SIZE - sizeof(struct err_notify_sccb))
#define ZPCI_REPORT_DATA_SIZE (ZPCI_REPORT_SIZE - sizeof(struct zpci_report_error_data))
/* Complete report: the report header followed directly by the report payload */
struct zpci_report_error {
struct zpci_report_error_header header;
struct zpci_report_error_data data;
} __packed;
/*
 * Translate a PCI channel state into the short name used in status reports.
 * Any state other than normal, frozen or permanent failure is reported as
 * "invalid".
 */
static const char *zpci_state_str(pci_channel_state_t state)
{
	const char *name;

	switch (state) {
	case pci_channel_io_normal:
		name = "normal";
		break;
	case pci_channel_io_frozen:
		name = "frozen";
		break;
	case pci_channel_io_perm_failure:
		name = "permanent-failure";
		break;
	default:
		name = "invalid";
		break;
	}

	return name;
}
/*
 * Header callback of the report view: writes the per entry prefix
 * "<sec>:<usec> <level> <exception marker> <cpu> " into @out_buf.
 * The exception marker is "*" for exception entries and "-" otherwise.
 * @id, @view and @area are not used.
 *
 * Returns the value returned by scnprintf().
 */
static int debug_log_header_fn(debug_info_t *id, struct debug_view *view,
			       int area, debug_entry_t *entry, char *out_buf,
			       size_t out_buf_size)
{
	unsigned long seconds = entry->clock;
	unsigned int level = entry->level;
	unsigned long micros;
	char *marker;

	/* do_div() leaves the quotient in seconds and returns the remainder */
	micros = do_div(seconds, USEC_PER_SEC);
	marker = entry->exception ? "*" : "-";

	return scnprintf(out_buf, out_buf_size, "%011ld:%06lu %1u %1s %04u ",
			 seconds, micros, level, marker, entry->cpu);
}
/*
 * Prolog callback of the report view: writes the column heading line that
 * labels the fields emitted by debug_log_header_fn() into @out_buf.
 * @id and @view are not used. Returns the value returned by scnprintf().
 */
static int debug_prolog_header(debug_info_t *id, struct debug_view *view,
char *out_buf, size_t out_buf_size)
{
return scnprintf(out_buf, out_buf_size, "sec:usec level except cpu msg\n");
}
/*
 * View passed to debug_dump() by zpci_report_status() to render the PCI
 * debug messages. The members are initialized positionally: the view name,
 * the prolog callback, then the local header callback and the s390dbf
 * sprintf format callback; the two trailing members are left NULL.
 * NOTE(review): the order must match the member order of struct debug_view,
 * which is only partially visible here - confirm against asm/debug.h.
 */
static struct debug_view debug_log_view = {
"pci_msg_log",
&debug_prolog_header,
&debug_log_header_fn,
&debug_sprintf_format_fn,
NULL,
NULL
};
/**
 * zpci_report_status - Report the status of operations on a PCI device
 * @zdev: The PCI device for which to report status
 * @operation: A string representing the operation reported
 * @status: A string representing the status of the operation
 *
 * This function creates a human readable report about an operation such as
 * PCI device recovery and forwards this to the platform using the SCLP Write
 * Event Data mechanism. Besides the operation and status strings the report
 * also contains additional information about the device deemed useful for
 * debug such as the currently bound device driver, if any, and error state.
 * Additionally a string representation of pci_debug_msg_id, or as much as fits,
 * is also included.
 *
 * Return: 0 on success, an error code < 0 otherwise: -ENODEV if @zdev or its
 * bus is missing, -ENODATA when running as a protected virtualization guest,
 * -ENOMEM if the report page cannot be allocated, or the error returned by
 * sclp_pci_report().
 */
int zpci_report_status(struct zpci_dev *zdev, const char *operation, const char *status)
{
	struct zpci_report_error *report;
	struct pci_driver *driver = NULL;
	struct pci_dev *pdev = NULL;
	char *buf, *end;
	int ret;

	if (!zdev || !zdev->zbus)
		return -ENODEV;

	/* Protected virtualization hosts get nothing from us */
	if (prot_virt_guest)
		return -ENODATA;

	report = (void *)get_zeroed_page(GFP_KERNEL);
	if (!report)
		return -ENOMEM;
	/* pci_get_slot() takes a reference on the device; dropped below */
	if (zdev->zbus->bus)
		pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn);
	if (pdev)
		driver = to_pci_driver(pdev->dev.driver);

	/* Build the textual report; scnprintf() never writes past end */
	buf = report->data.log_data;
	end = report->data.log_data + ZPCI_REPORT_DATA_SIZE;
	buf += scnprintf(buf, end - buf, "report: %s\n", operation);
	buf += scnprintf(buf, end - buf, "status: %s\n", status);
	buf += scnprintf(buf, end - buf, "state: %s\n",
			 (pdev) ? zpci_state_str(pdev->error_state) : "n/a");
	buf += scnprintf(buf, end - buf, "driver: %s\n", (driver) ? driver->name : "n/a");
	/* Append the most recent PCI debug messages, newest first, as space permits */
	ret = debug_dump(pci_debug_msg_id, &debug_log_view, buf, end - buf, true);
	if (ret < 0)
		pr_err("Reading PCI debug messages failed with code %d\n", ret);
	else
		buf += ret;

	report->header.version = 1;
	report->header.action = SCLP_ERRNOTIFY_AQ_INFO_LOG;
	/* Length covers the fixed data fields plus the text written so far */
	report->header.length = buf - (char *)&report->data;
	report->data.timestamp = ktime_get_clocktai_seconds();
	report->data.err_log_id = ZPCI_ERR_LOG_ID_KERNEL_REPORT;

	ret = sclp_pci_report(&report->header, zdev->fh, zdev->fid);
	if (ret)
		pr_err("Reporting PCI status failed with code %d\n", ret);
	else
		pr_info("Reported PCI device status\n");

	free_page((unsigned long)report);
	/*
	 * Drop the reference taken by pci_get_slot(). driver->name is no
	 * longer used at this point and pci_dev_put() accepts NULL.
	 */
	pci_dev_put(pdev);

	return ret;
}

View File

@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright IBM Corp. 2024
*
* Author(s):
* Niklas Schnelle <schnelle@linux.ibm.com>
*
*/
#ifndef __S390_PCI_REPORT_H
#define __S390_PCI_REPORT_H
/* Forward declaration; only a pointer to the device is needed here */
struct zpci_dev;
/*
 * Report the status of an operation on a PCI device, implemented in
 * pci_report.c. Returns 0 on success and an error code < 0 otherwise.
 */
int zpci_report_status(struct zpci_dev *zdev, const char *operation, const char *status);
#endif /* __S390_PCI_REPORT_H */

View File

@ -85,13 +85,6 @@ typedef unsigned int sclp_cmdw_t;
typedef u64 sccb_mask_t;
struct sccb_header {
u16 length;
u8 function_code;
u8 control_mask[3];
u16 response_code;
} __attribute__((packed));
struct init_sccb {
struct sccb_header header;
u16 _reserved;
@ -240,13 +233,6 @@ struct gds_vector {
u16 gds_id;
} __attribute__((packed));
struct evbuf_header {
u16 length;
u8 type;
u8 flags;
u16 _reserved;
} __attribute__((packed));
struct sclp_req {
struct list_head list; /* list_head for request queueing. */
sclp_cmdw_t command; /* sclp command to execute */

View File

@ -24,30 +24,11 @@
#define SCLP_ATYPE_PCI 2
#define SCLP_ERRNOTIFY_AQ_RESET 0
#define SCLP_ERRNOTIFY_AQ_REPAIR 1
#define SCLP_ERRNOTIFY_AQ_INFO_LOG 2
#define SCLP_ERRNOTIFY_AQ_OPTICS_DATA 3
static DEFINE_MUTEX(sclp_pci_mutex);
static struct sclp_register sclp_pci_event = {
.send_mask = EVTYP_ERRNOTIFY_MASK,
};
struct err_notify_evbuf {
struct evbuf_header header;
u8 action;
u8 atype;
u32 fh;
u32 fid;
u8 data[];
} __packed;
struct err_notify_sccb {
struct sccb_header header;
struct err_notify_evbuf evbuf;
} __packed;
struct pci_cfg_sccb {
struct sccb_header header;
u8 atype; /* adapter type */