Skip to content

Commit

Permalink
net/mlx5: Start health poll at earlier stage of driver load
Browse files Browse the repository at this point in the history
Start health poll at earlier stage, so if fw fatal issue occurred before
or during initialization commands such as init_hca or set_hca_cap the
poll health can detect and indicate that the driver is already in error
state.

Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
mosheshemesh2 authored and kuba-moo committed Oct 3, 2022
1 parent 16ab85e commit 9b98d39
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 10 deletions.
11 changes: 8 additions & 3 deletions drivers/net/ethernet/mellanox/mlx5/core/health.c
Original file line number Diff line number Diff line change
Expand Up @@ -843,9 +843,6 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)

health->timer.expires = jiffies + msecs_to_jiffies(poll_interval_ms);
add_timer(&health->timer);

if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc))
queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
}

void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
Expand All @@ -862,6 +859,14 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
del_timer_sync(&health->timer);
}

void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;

if (mlx5_core_is_pf(dev) && MLX5_CAP_MCAM_REG(dev, mrtc))
queue_delayed_work(health->wq, &health->update_fw_log_ts_work, 0);
}

void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
{
struct mlx5_core_health *health = &dev->priv.health;
Expand Down
17 changes: 10 additions & 7 deletions drivers/net/ethernet/mellanox/mlx5/core/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -1092,7 +1092,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
mlx5_devcom_unregister_device(dev->priv.devcom);
}

static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot, u64 timeout)
{
int err;

Expand Down Expand Up @@ -1130,10 +1130,12 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)

mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_UP);

mlx5_start_health_poll(dev);

err = mlx5_core_enable_hca(dev, 0);
if (err) {
mlx5_core_err(dev, "enable hca failed\n");
goto err_cmd_cleanup;
goto stop_health_poll;
}

err = mlx5_core_set_issi(dev);
Expand Down Expand Up @@ -1185,15 +1187,16 @@ static int mlx5_function_setup(struct mlx5_core_dev *dev, u64 timeout)
mlx5_core_err(dev, "query hca failed\n");
goto reclaim_boot_pages;
}

mlx5_start_health_poll(dev);
mlx5_start_health_fw_log_up(dev);

return 0;

reclaim_boot_pages:
mlx5_reclaim_startup_pages(dev);
err_disable_hca:
mlx5_core_disable_hca(dev, 0);
stop_health_poll:
mlx5_stop_health_poll(dev, boot);
err_cmd_cleanup:
mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN);
mlx5_cmd_cleanup(dev);
Expand All @@ -1205,14 +1208,14 @@ static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot)
{
int err;

mlx5_stop_health_poll(dev, boot);
err = mlx5_cmd_teardown_hca(dev);
if (err) {
mlx5_core_err(dev, "tear_down_hca failed, skip cleanup\n");
return err;
}
mlx5_reclaim_startup_pages(dev);
mlx5_core_disable_hca(dev, 0);
mlx5_stop_health_poll(dev, boot);
mlx5_cmd_set_state(dev, MLX5_CMDIF_STATE_DOWN);
mlx5_cmd_cleanup(dev);

Expand Down Expand Up @@ -1362,7 +1365,7 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
mutex_lock(&dev->intf_state_mutex);
dev->state = MLX5_DEVICE_STATE_UP;

err = mlx5_function_setup(dev, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
err = mlx5_function_setup(dev, true, mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT));
if (err)
goto err_function;

Expand Down Expand Up @@ -1450,7 +1453,7 @@ int mlx5_load_one_devl_locked(struct mlx5_core_dev *dev, bool recovery)
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_ON_RECOVERY_TIMEOUT);
else
timeout = mlx5_tout_ms(dev, FW_PRE_INIT_TIMEOUT);
err = mlx5_function_setup(dev, timeout);
err = mlx5_function_setup(dev, false, timeout);
if (err)
goto err_function;

Expand Down
1 change: 1 addition & 0 deletions include/linux/mlx5/driver.h
Original file line number Diff line number Diff line change
Expand Up @@ -1017,6 +1017,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev);
int mlx5_health_init(struct mlx5_core_dev *dev);
void mlx5_start_health_poll(struct mlx5_core_dev *dev);
void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
void mlx5_start_health_fw_log_up(struct mlx5_core_dev *dev);
void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
void mlx5_trigger_health_work(struct mlx5_core_dev *dev);
int mlx5_frag_buf_alloc_node(struct mlx5_core_dev *dev, int size,
Expand Down

0 comments on commit 9b98d39

Please sign in to comment.