Skip to content

Commit

Permalink
Add time based reconnect backoff
Browse files Browse the repository at this point in the history
The current logic can make the client retry reconnecting nonstop and very
quickly, until it exhausts its reconnect attempts, when exceptions are raised
while trying to establish a connection.
  • Loading branch information
Waldemar Quevedo committed Mar 1, 2017
1 parent 90bcddd commit d2797ce
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 4 deletions.
38 changes: 36 additions & 2 deletions lib/nats/client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,18 @@ def process_pong
end

# Decides whether the next connection attempt against +server+ has to be
# delayed (backed off) instead of being tried right away.
#
# server - Hash describing one server entry. Keys read here:
#          :was_connected, :reconnect_attempts and :last_reconnect_attempt.
#
# Returns true when the attempt should be delayed, false otherwise.
def should_delay_connect?(server)
  # Entries flagged :was_connected with a non-negative attempt count
  # always go through the delay.
  return true if server[:was_connected] && server[:reconnect_attempts] >= 0

  # No reconnect attempt recorded for this server: nothing to back off from.
  last_attempt = server[:last_reconnect_attempt]
  return false unless last_attempt

  # Delay only while we are still inside the configured wait window; once
  # :reconnect_time_wait seconds have elapsed the attempt may go ahead.
  MonotonicTime.now - last_attempt < @options[:reconnect_time_wait]
end

def schedule_reconnect #:nodoc:
Expand All @@ -824,7 +835,7 @@ def unbind #:nodoc:
# Allow notifying from which server we were disconnected,
# but only when we didn't trigger disconnecting ourselves.
if @disconnect_cb and connected? and not closing?
disconnect_cb.call(NATS::ConnectError.new(disconnect_error_string))
@disconnect_cb.call(NATS::ConnectError.new(disconnect_error_string))
end

# If we are closing or shouldn't reconnect, go ahead and disconnect.
Expand Down Expand Up @@ -890,6 +901,10 @@ def can_reuse_server?(server) #:nodoc:
def attempt_reconnect #:nodoc:
@reconnect_timer = nil
current = server_pool.first

# Snapshot time when trying to reconnect to server
# in order to back off for subsequent attempts.
current[:last_reconnect_attempt] = MonotonicTime.now
current[:reconnect_attempts] ||= 0
current[:reconnect_attempts] += 1

Expand Down Expand Up @@ -981,4 +996,23 @@ def inspect #:nodoc:
"<nats client v#{NATS::VERSION}>"
end

# Clock helper exposing a single class method, +now+, that returns the
# current reading in seconds as a Float. The implementation is picked once,
# when the class body is evaluated.
class MonotonicTime
  if defined?(Process::CLOCK_MONOTONIC)
    # Use the monotonic clock provided by the Process module.
    def self.now
      Process.clock_gettime(Process::CLOCK_MONOTONIC)
    end
  elsif RUBY_ENGINE == 'jruby'
    # On JRuby, convert the JVM nanosecond counter to seconds.
    def self.now
      java.lang.System.nanoTime() / 1_000_000_000.0
    end
  else
    # Fallback to regular time behavior
    def self.now
      ::Time.now.to_f
    end
  end
end
end
4 changes: 2 additions & 2 deletions spec/client/client_tls_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -676,7 +676,7 @@
nc = NATS.connect(options) do |conn|
connects += 1
end
end.to raise_error
end.to raise_error(NATS::Error)
end
expect(errors.count).to eql(0)
expect(disconnects).to eql(0)
Expand Down Expand Up @@ -725,7 +725,7 @@
nc = NATS.connect(options) do |conn|
connects += 1
end
end.to raise_error
end.to raise_error(NATS::Error)
end

# No error here since it fails synchronously
Expand Down
54 changes: 54 additions & 0 deletions spec/client/reconnect_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,58 @@

expect(received).to eql(true)
end

# An exception raised inside the reactor block, right after starting a
# reconnect-enabled client against a killed server, must propagate out of
# EM.run instead of being swallowed.
it 'should not get stuck reconnecting due to uncaught exceptions' do
  @as.kill_server

  # Anonymous error class: avoids declaring a constant from inside the
  # example, which would leak into the enclosing namespace.
  some_exception = Class.new(StandardError)

  expect do
    EM.run do
      NATS.connect(:uri => R_TEST_AUTH_SERVER, :reconnect => true, :max_reconnect_attempts => -1, :reconnect_time_wait => 0.25)
      raise some_exception
    end
  end.to raise_error(some_exception)
end

# Checks that, after a disconnect, the reconnect callback does not fire
# until at least :reconnect_time_wait (2 here) has passed, as measured
# with NATS::MonotonicTime.
it 'should back off trying to reconnect' do
@as.start_server

# Timestamps captured by the callbacks below; only the first occurrence
# of each event is recorded (||=).
disconnected_time = nil
reconnected_time = nil
connected_once = false
EM.run do
NATS.on_disconnect do
# Capture the time of the first disconnect
disconnected_time ||= NATS::MonotonicTime.now
end

NATS.on_reconnect do
# Capture the time of the first reconnect
reconnected_time ||= NATS::MonotonicTime.now
end

# Unlimited reconnect attempts (-1) with a reconnect wait of 2.
NATS.connect(:uri => R_TEST_AUTH_SERVER, :reconnect => true, :max_reconnect_attempts => -1, :reconnect_time_wait => 2) do
connected_once = true
end

# Kill the server at 0.5 to trigger the disconnect...
EM.add_timer(0.5) do
@as.kill_server
end

# ...and bring it back at 1, i.e. before the reconnect wait has elapsed.
EM.add_timer(1) do
@as.start_server
end

# Stop the client and the reactor at 3 so the example terminates.
EM.add_timer(3) do
NATS.stop
EM.stop
end
end

expect(connected_once).to eql(true)
expect(disconnected_time).to_not be(nil)
expect(reconnected_time).to_not be(nil)
# The gap between first disconnect and first reconnect must be at least
# the configured reconnect wait.
expect(reconnected_time - disconnected_time >= 2).to eql(true)
end
end

0 comments on commit d2797ce

Please sign in to comment.