Skip to content

Commit

Permalink
Merge pull request #8493 from rabbitmq/mergify/bp/v3.11.x/pr-8491
Browse files Browse the repository at this point in the history
rabbit_feature_flags: Retry after `erpc:call()` fails with `noconnection` (backport #8411) (backport #8491)
  • Loading branch information
dumbbell committed Jun 6, 2023
2 parents a9a32e9 + b61b7a7 commit b3e4df7
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 0 deletions.
24 changes: 24 additions & 0 deletions deps/rabbit/src/rabbit_ff_controller.erl
Original file line number Diff line number Diff line change
Expand Up @@ -1175,6 +1175,8 @@ this_node_first(Nodes) ->
Ret :: term() | {error, term()}.

rpc_call(Node, Module, Function, Args, Timeout) ->
SleepBetweenRetries = 5000,
T0 = erlang:monotonic_time(),
try
erpc:call(Node, Module, Function, Args, Timeout)
catch
Expand All @@ -1195,6 +1197,28 @@ rpc_call(Node, Module, Function, Args, Timeout) ->
[Module, Function, Args, Node],
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
{error, pre_feature_flags_rabbitmq};

%% In case of `noconnection' with `Timeout'=infinity, we don't retry
%% at all. This is because the infinity "timeout" is used to run
%% callbacks on remote node and they can last an indefinite amount of
%% time, for instance, if there is a lot of data to migrate.
error:{erpc, noconnection} = Reason
when is_integer(Timeout) andalso Timeout > SleepBetweenRetries ->
?LOG_ERROR(
"Feature flags: no connection to node `~ts`; "
"retrying in ~b milliseconds",
[Node, SleepBetweenRetries],
#{domain => ?RMQLOG_DOMAIN_FEAT_FLAGS}),
timer:sleep(SleepBetweenRetries),
T1 = erlang:monotonic_time(),
TDiff = erlang:convert_time_unit(T1 - T0, native, millisecond),
Remaining = Timeout - TDiff,
Timeout1 = erlang:max(Remaining, 0),
case Timeout1 of
0 -> {error, Reason};
_ -> rpc_call(Node, Module, Function, Args, Timeout1)
end;

Class:Reason:Stacktrace ->
Message0 = erl_error:format_exception(Class, Reason, Stacktrace),
Message1 = lists:flatten(Message0),
Expand Down
29 changes: 29 additions & 0 deletions deps/rabbit/test/feature_flags_v2_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
mf_wait_and_count_runs_v2_post_enable/1,
mf_crash_on_joining_node/1,

rpc_calls/1,
enable_unknown_feature_flag_on_a_single_node/1,
enable_supported_feature_flag_on_a_single_node/1,
enable_unknown_feature_flag_in_a_3node_cluster/1,
Expand All @@ -38,7 +39,9 @@
enable_feature_flag_in_cluster_and_add_member_after/1,
enable_feature_flag_in_cluster_and_add_member_concurrently_mfv1/1,
enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2/1,
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1/0,
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1/1,
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2/0,
enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2/1,
enable_feature_flag_with_post_enable/1,
have_required_feature_flag_in_cluster_and_add_member_with_it_disabled/1,
Expand All @@ -58,6 +61,10 @@ all() ->
groups() ->
Groups =
[
{direct, [parallel],
[
rpc_calls
]},
{cluster_size_1, [parallel],
[
enable_unknown_feature_flag_on_a_single_node,
Expand Down Expand Up @@ -103,6 +110,8 @@ init_per_group(feature_flags_v1, Config) ->
rabbit_ct_helpers:set_config(Config, {enable_feature_flags_v2, false});
init_per_group(feature_flags_v2, Config) ->
rabbit_ct_helpers:set_config(Config, {enable_feature_flags_v2, true});
init_per_group(direct, Config) ->
Config;
init_per_group(cluster_size_1, Config) ->
rabbit_ct_helpers:set_config(Config, {nodes_count, 1});
init_per_group(cluster_size_3, Config) ->
Expand All @@ -113,11 +122,15 @@ init_per_group(_Group, Config) ->
end_per_group(_Group, Config) ->
Config.

init_per_testcase(rpc_calls, Config) ->
Config;
init_per_testcase(Testcase, Config) ->
rabbit_ct_helpers:run_steps(
Config,
[fun(Cfg) -> start_slave_nodes(Cfg, Testcase) end]).

end_per_testcase(rpc_calls, Config) ->
Config;
end_per_testcase(_Testcase, Config) ->
rabbit_ct_helpers:run_steps(
Config,
Expand Down Expand Up @@ -369,6 +382,16 @@ get_peer_proc() ->
%% Testcases.
%% -------------------------------------------------------------------

rpc_calls(_Config) ->
List = [1, 2, 3],
?assertEqual(
lists:sum(List),
rabbit_ff_controller:rpc_call(node(), lists, sum, [List], 11000)),
?assertEqual(
{error, {erpc, noconnection}},
rabbit_ff_controller:rpc_call(
nonode@non_existing_host, lists, sum, [List], 11000)).

enable_unknown_feature_flag_on_a_single_node(Config) ->
[Node] = ?config(nodes, Config),
ok = run_on_node(
Expand Down Expand Up @@ -1148,6 +1171,9 @@ enable_feature_flag_in_cluster_and_add_member_concurrently_mfv2(Config) ->
|| Node <- AllNodes],
ok.

enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1() ->
[{timetrap, {minutes, 3}}].

enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1(Config) ->
AllNodes = [LeavingNode | [FirstNode | _] = Nodes] = ?config(
nodes, Config),
Expand Down Expand Up @@ -1255,6 +1281,9 @@ enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv1(Config) ->
|| Node <- Nodes],
ok.

enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2() ->
[{timetrap, {minutes, 3}}].

enable_feature_flag_in_cluster_and_remove_member_concurrently_mfv2(Config) ->
AllNodes = [LeavingNode | [FirstNode | _] = Nodes] = ?config(
nodes, Config),
Expand Down

0 comments on commit b3e4df7

Please sign in to comment.