diff --git a/app/models/miq_server/server_monitor.rb b/app/models/miq_server/server_monitor.rb index 02a771c8488..b9624c6fdce 100644 --- a/app/models/miq_server/server_monitor.rb +++ b/app/models/miq_server/server_monitor.rb @@ -16,25 +16,29 @@ def mark_as_not_responding(seconds = miq_server_time_threshold) end def make_master_server(last_master) - _log.info "Master server has #{last_master.nil? ? "not been set" : "died"}. Attempting takeover as new master server." - parent = MiqRegion.my_region + _log.info "Master server has #{last_master.nil? ? "not been set" : "died, #{last_master.name}"}. Attempting takeover as new master server, #{name}." + parent = MiqRegion.my_region(true) parent.lock do - all_servers = parent.miq_servers + # See if an ACTIVE server has already taken over + active_servers = parent.active_miq_servers _log.debug "Double checking that nothing has changed" - master = all_servers.detect(&:is_master?) + master = active_servers.detect(&:is_master?) if (last_master.nil? && !master.nil?) || (!last_master.nil? && !master.nil? && last_master.id != master.id) - _log.info "Aborting master server takeover as another server has taken control first." + _log.info "Aborting master server takeover as another server, #{master.name}, has taken control first." return nil end - _log.debug "Setting this server as master server" - all_servers.each do |s| + _log.debug "Setting this server, #{name}, as master server" + + # Set is_master on self, reset every other server in the region, including + # inactive ones. + parent.miq_servers.each do |s| s.is_master = (id == s.id) s.save! end end - _log.info "This server is now set as the master server" + _log.info "This server #{name} is now set as the master server, last_master: #{last_master.try(:name)}" self end diff --git a/spec/models/miq_server/server_monitor_spec.rb b/spec/models/miq_server/server_monitor_spec.rb index 9dcf727c24a..27d91379194 100644 --- a/spec/models/miq_server/server_monitor_spec.rb +++ b/spec/models/miq_server/server_monitor_spec.rb @@ -461,6 +461,55 @@ @miq_server3.reload end + it "should support multiple failover transitions from stopped master" do + # server1 is first to start, becomes master + @miq_server1.monitor_servers + + # Initialize the bookkeeping around current and last master + @miq_server2.monitor_servers + @miq_server3.monitor_servers + + # server1 is master + expect(@miq_server1.reload.is_master).to be_truthy + expect(@miq_server2.reload.is_master).to be_falsey + expect(@miq_server3.reload.is_master).to be_falsey + + # server 1 shuts down + @miq_server1.update(:status => "stopped") + + # server 3 becomes master, server 2 hasn't monitored servers yet + @miq_server3.monitor_servers + expect(@miq_server1.reload.is_master).to be_falsey + expect(@miq_server2.reload.is_master).to be_falsey + expect(@miq_server3.reload.is_master).to be_truthy + + # server 3 shuts down + @miq_server3.update(:status => "stopped") + + # server 2 finally gets to monitor_servers, takes over + @miq_server2.monitor_servers + expect(@miq_server1.reload.is_master).to be_falsey + expect(@miq_server2.reload.is_master).to be_truthy + expect(@miq_server3.reload.is_master).to be_falsey + end + + it "should failover from stopped master on startup" do + # server 1 is first to start, becomes master + @miq_server1.monitor_servers + + # server 1 shuts down + @miq_server1.update(:status => "stopped") + + # server 3 boots and hasn't run monitor_servers yet + expect(@miq_server1.reload.is_master).to be_truthy + expect(@miq_server3.reload.is_master).to be_falsey + + # server 3 runs monitor_servers and becomes master + @miq_server3.monitor_servers + expect(@miq_server1.reload.is_master).to be_falsey + expect(@miq_server3.reload.is_master).to be_truthy + end + it "should have all roles active after sync between them" do expect(@miq_server1.active_role_names.include?("ems_operations")).to be_truthy expect(@miq_server2.active_role_names.include?("ems_operations")).to be_truthy