From 10c783df6c9adbe18b7495e83649f52249c895fd Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Mon, 17 Jun 2019 14:41:49 -0700 Subject: [PATCH] Improve on_connected callback to avoid reconnect storm (#292) * Ensure there is only one watcher refresh per time window * Fix indentation * Fix typo * Refactor and unit test reconnect time * Fix ruby 1.9.3 ci * Move to sleep after test_and_set * revert .travis.yml * Fix typo * Bump up version to 0.16.12 --- Gemfile.lock | 2 +- lib/synapse/service_watcher/zookeeper.rb | 32 +++++++++++++++++++ lib/synapse/version.rb | 2 +- .../synapse/service_watcher_zookeeper_spec.rb | 11 +++++++ 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 6d409093..8bdd60e9 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,7 +1,7 @@ PATH remote: . specs: - synapse (0.16.11) + synapse (0.16.12) aws-sdk (~> 1.39) docker-api (~> 1.7) dogstatsd-ruby (~> 3.3.0) diff --git a/lib/synapse/service_watcher/zookeeper.rb b/lib/synapse/service_watcher/zookeeper.rb index 57d867c9..ccf0350a 100644 --- a/lib/synapse/service_watcher/zookeeper.rb +++ b/lib/synapse/service_watcher/zookeeper.rb @@ -363,6 +363,27 @@ def zk_connect zk_cleanup end + # handle session connected after reconnecting + # http://zookeeper.apache.org/doc/r3.3.5/zookeeperProgrammers.html#ch_zkSessions + @zk.on_connected do + log.info "synapse: ZK client has reconnected #{@name}" + unless test_and_set_reconnect_time(Time.now) + log.info "synapse: ZK client skip since last reconnect is too close #{@name}" + return + end + + # random backoff to avoid refreshing all watchers at the same time + sleep rand(30) + + # zookeeper watcher is one-time trigger, and can be lost when disconnected + # https://zookeeper.apache.org/doc/r3.3.5/zookeeperProgrammers.html#ch_zkWatches + # only need re-enable watcher on parent path and children list + log.info "synapse: ZK client refresh watcher after reconnected #{@name}" + if @zk.exists?(@discovery['path'], :watch => true) + @zk.children(@discovery['path'], :watch => true) + end + end + # the path must exist, otherwise watch callbacks will not work statsd_time('synapse.watcher.zk.create_path.elapsed_time', ["zk_cluster:#{@zk_cluster}", "service_name:#{@name}"]) do create(@discovery['path']) @@ -373,6 +394,17 @@ def zk_connect end end + def test_and_set_reconnect_time(now) + # ensure there is only one refresh can happen within a time window + if !@last_reconnect_time.nil? && (now - @last_reconnect_time) < 60 + return false + end + # test-and-set should be thread safe based on per-callback model + # https://github.com/zk-ruby/zk/wiki/EventDeliveryModel + @last_reconnect_time = now + true + + end # decode the data at a zookeeper endpoint def deserialize_service_instance(data) log.debug "synapse: deserializing process data" diff --git a/lib/synapse/version.rb b/lib/synapse/version.rb index 12e5f56e..61ae47fe 100644 --- a/lib/synapse/version.rb +++ b/lib/synapse/version.rb @@ -1,3 +1,3 @@ module Synapse - VERSION = "0.16.11" + VERSION = "0.16.12" end diff --git a/spec/lib/synapse/service_watcher_zookeeper_spec.rb b/spec/lib/synapse/service_watcher_zookeeper_spec.rb index 0f2f9f4c..867719af 100644 --- a/spec/lib/synapse/service_watcher_zookeeper_spec.rb +++ b/spec/lib/synapse/service_watcher_zookeeper_spec.rb @@ -149,6 +149,17 @@ subject.send(:watcher_callback).call end + it 'test and set reconnect time' do + now = Time.now + expect(subject.send(:test_and_set_reconnect_time, now)).to be true + now += 10 + expect(subject.send(:test_and_set_reconnect_time, now)).to be false + now += 50 + expect(subject.send(:test_and_set_reconnect_time, now)).to be true + now += 60 + expect(subject.send(:test_and_set_reconnect_time, now)).to be true + end + it 'handles zk consistency issues' do expect(subject).to receive(:watch) expect(subject).to receive(:discover).and_call_original