mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git
synced 2025-01-17 02:36:21 +00:00
net/mlx5: Drain fw_reset when removing device
In case fw sync reset is called in parallel to device removal, device might stuck in the following deadlock: CPU 0 CPU 1 ----- ----- remove_one uninit_one (locks intf_state_mutex) mlx5_sync_reset_now_event() work in fw_reset->wq. mlx5_enter_error_state() mutex_lock (intf_state_mutex) cleanup_once fw_reset_cleanup() destroy_workqueue(fw_reset->wq) Drain the fw_reset WQ, and make sure no new work is being queued, before entering uninit_one(). The Drain is done before devlink_unregister() since fw_reset, in some flows, is using devlink API devlink_remote_reload_actions_performed(). Fixes: 38b9f903f22b ("net/mlx5: Handle sync reset request event") Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
This commit is contained in:
parent
04c551bad3
commit
16d42d3133
@ -8,7 +8,8 @@
|
||||
enum {
|
||||
MLX5_FW_RESET_FLAGS_RESET_REQUESTED,
|
||||
MLX5_FW_RESET_FLAGS_NACK_RESET_REQUEST,
|
||||
MLX5_FW_RESET_FLAGS_PENDING_COMP
|
||||
MLX5_FW_RESET_FLAGS_PENDING_COMP,
|
||||
MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS
|
||||
};
|
||||
|
||||
struct mlx5_fw_reset {
|
||||
@ -208,7 +209,10 @@ static void poll_sync_reset(struct timer_list *t)
|
||||
|
||||
if (fatal_error) {
|
||||
mlx5_core_warn(dev, "Got Device Reset\n");
|
||||
if (!test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags))
|
||||
queue_work(fw_reset->wq, &fw_reset->reset_reload_work);
|
||||
else
|
||||
mlx5_core_err(dev, "Device is being removed, Drop new reset work\n");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -433,6 +437,9 @@ static int fw_reset_event_notifier(struct notifier_block *nb, unsigned long acti
|
||||
struct mlx5_fw_reset *fw_reset = mlx5_nb_cof(nb, struct mlx5_fw_reset, nb);
|
||||
struct mlx5_eqe *eqe = data;
|
||||
|
||||
if (test_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags))
|
||||
return NOTIFY_DONE;
|
||||
|
||||
switch (eqe->sub_type) {
|
||||
case MLX5_GENERAL_SUBTYPE_FW_LIVE_PATCH_EVENT:
|
||||
queue_work(fw_reset->wq, &fw_reset->fw_live_patch_work);
|
||||
@ -479,6 +486,18 @@ void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev)
|
||||
mlx5_eq_notifier_unregister(dev, &dev->priv.fw_reset->nb);
|
||||
}
|
||||
|
||||
void mlx5_drain_fw_reset(struct mlx5_core_dev *dev)
|
||||
{
|
||||
struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
|
||||
|
||||
set_bit(MLX5_FW_RESET_FLAGS_DROP_NEW_REQUESTS, &fw_reset->reset_flags);
|
||||
cancel_work_sync(&fw_reset->fw_live_patch_work);
|
||||
cancel_work_sync(&fw_reset->reset_request_work);
|
||||
cancel_work_sync(&fw_reset->reset_reload_work);
|
||||
cancel_work_sync(&fw_reset->reset_now_work);
|
||||
cancel_work_sync(&fw_reset->reset_abort_work);
|
||||
}
|
||||
|
||||
int mlx5_fw_reset_init(struct mlx5_core_dev *dev)
|
||||
{
|
||||
struct mlx5_fw_reset *fw_reset = kzalloc(sizeof(*fw_reset), GFP_KERNEL);
|
||||
|
@ -16,6 +16,7 @@ int mlx5_fw_reset_set_live_patch(struct mlx5_core_dev *dev);
|
||||
int mlx5_fw_reset_wait_reset_done(struct mlx5_core_dev *dev);
|
||||
void mlx5_fw_reset_events_start(struct mlx5_core_dev *dev);
|
||||
void mlx5_fw_reset_events_stop(struct mlx5_core_dev *dev);
|
||||
void mlx5_drain_fw_reset(struct mlx5_core_dev *dev);
|
||||
int mlx5_fw_reset_init(struct mlx5_core_dev *dev);
|
||||
void mlx5_fw_reset_cleanup(struct mlx5_core_dev *dev);
|
||||
|
||||
|
@ -1627,6 +1627,10 @@ static void remove_one(struct pci_dev *pdev)
|
||||
struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
|
||||
struct devlink *devlink = priv_to_devlink(dev);
|
||||
|
||||
/* mlx5_drain_fw_reset() is using devlink APIs. Hence, we must drain
|
||||
* fw_reset before unregistering the devlink.
|
||||
*/
|
||||
mlx5_drain_fw_reset(dev);
|
||||
devlink_unregister(devlink);
|
||||
mlx5_sriov_disable(pdev);
|
||||
mlx5_crdump_disable(dev);
|
||||
|
Loading…
x
Reference in New Issue
Block a user