Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

split host health check failure types #108

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions include/envoy/upstream/upstream.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ class Host : virtual public HostDescription {
HostDescriptionPtr host_description_;
};

/**
 * Bit flags describing the distinct ways a host can be failing health checks.
 * Flags are combined with bitwise OR into the value returned by
 * Host::healthFailures(); a value of 0 means the host is fully healthy.
 */
struct HealthFailures {
  // The host is currently failing active health checks.
  // NOTE: constexpr (not const) so the constant can be ODR-used (e.g. bound to a
  // const reference) without requiring an out-of-class definition.
  static constexpr uint64_t ACTIVE_HC = 0x1;
};

/**
* @return host specific counters.
*/
Expand All @@ -44,14 +49,21 @@ class Host : virtual public HostDescription {
virtual std::list<std::reference_wrapper<Stats::Gauge>> gauges() const PURE;

/**
* @return bool whether the host is currently healthy and routable.
* @return all health failure states for the host. This is a logical OR of HealthFailures.
*/
virtual uint64_t healthFailures() const PURE;

/**
* Atomically clear a health failure state for a host. Failure states are specified in
* HealthFailures.
*/
virtual bool healthy() const PURE;
virtual void healthFailureClear(uint64_t failure) PURE;

/**
* Set whether the host is currently healthy and routable.
* Atomically set a health failure state for a host. Failure states are specified in
* HealthFailures.
*/
virtual void healthy(bool is_healthy) PURE;
virtual void healthFailureSet(uint64_t failure) PURE;

/**
* @return the current load balancing weight of the host, in the range 1-100.
Expand Down
19 changes: 10 additions & 9 deletions source/common/upstream/health_checker_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,13 @@ HealthCheckerImplBase::ActiveHealthCheckSession::ActiveHealthCheckSession(
interval_timer_(parent.dispatcher_.createTimer([this]() -> void { onInterval(); })),
timeout_timer_(parent.dispatcher_.createTimer([this]() -> void { onTimeout(); })) {

if (host->healthy()) {
if (!(host->healthFailures() & Host::HealthFailures::ACTIVE_HC)) {
parent.incHealthy();
}
}

HealthCheckerImplBase::ActiveHealthCheckSession::~ActiveHealthCheckSession() {
if (host_->healthy()) {
if (!(host_->healthFailures() & Host::HealthFailures::ACTIVE_HC)) {
parent_.decHealthy();
}
}
Expand All @@ -117,12 +117,12 @@ void HealthCheckerImplBase::ActiveHealthCheckSession::handleSuccess() {
num_unhealthy_ = 0;

bool changed_state = false;
if (!host_->healthy()) {
if (host_->healthFailures() & Host::HealthFailures::ACTIVE_HC) {
// If this is the first time we ever got a check result on this host, we immediately move
// it to healthy. This makes startup faster with a small reduction in overall reliability
// depending on the HC settings.
if (first_check_ || ++num_healthy_ == parent_.healthy_threshold_) {
host_->healthy(true);
host_->healthFailureClear(Host::HealthFailures::ACTIVE_HC);
parent_.incHealthy();
changed_state = true;
}
Expand All @@ -138,9 +138,9 @@ void HealthCheckerImplBase::ActiveHealthCheckSession::handleFailure(bool timeout
num_healthy_ = 0;

bool changed_state = false;
if (host_->healthy()) {
if (!(host_->healthFailures() & Host::HealthFailures::ACTIVE_HC)) {
if (!timeout || ++num_unhealthy_ == parent_.unhealthy_threshold_) {
host_->healthy(false);
host_->healthFailureSet(Host::HealthFailures::ACTIVE_HC);
parent_.decHealthy();
changed_state = true;
}
Expand Down Expand Up @@ -249,7 +249,7 @@ void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onResetStream(Http::St
}

timeout_timer_->disableTimer();
conn_log_debug("connection/stream error host_healthy={}", *client_, host_->healthy());
conn_log_debug("connection/stream error health_failures={}", *client_, host_->healthFailures());
handleFailure(true);
interval_timer_->enableTimer(parent_.interval());
}
Expand All @@ -261,7 +261,8 @@ bool HttpHealthCheckerImpl::HttpActiveHealthCheckSession::isHealthCheckSucceeded
// the host is healthy, we need to see if we have reached the unhealthy count. If a host returns
// a response code other than 200 we ignore the number of unhealthy and immediately set it to
// unhealthy.
conn_log_debug("hc response={} host_healthy={}", *client_, response_code, host_->healthy());
conn_log_debug("hc response={} health_failures={}", *client_, response_code,
host_->healthFailures());

if (response_code != enumToInt(Http::Code::OK)) {
return false;
Expand Down Expand Up @@ -298,7 +299,7 @@ void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onResponseComplete() {
}

void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onTimeout() {
conn_log_debug("connection/stream timeout host_healthy={}", *client_, host_->healthy());
conn_log_debug("connection/stream timeout health_failures={}", *client_, host_->healthFailures());
handleFailure(true);

// If there is an active request it will get reset, so make sure we ignore the reset.
Expand Down
8 changes: 5 additions & 3 deletions source/common/upstream/upstream_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ ClusterImplBase::ClusterImplBase(const Json::Object& config, Stats::Store& stats
ConstHostVectorPtr ClusterImplBase::createHealthyHostList(const std::vector<HostPtr>& hosts) {
HostVectorPtr healthy_list(new std::vector<HostPtr>());
for (auto host : hosts) {
if (host->healthy()) {
if (!host->healthFailures()) {
healthy_list->emplace_back(host);
}
}
Expand Down Expand Up @@ -213,14 +213,16 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const std::vector<HostPtr>& n
hosts_added.push_back(host);

// If we are depending on a health checker, we initialize to unhealthy.
hosts_added.back()->healthy(!depend_on_hc);
if (depend_on_hc) {
hosts_added.back()->healthFailureSet(Host::HealthFailures::ACTIVE_HC);
}
}
}

// If there are removed hosts, check to see if we should only delete if unhealthy.
if (!current_hosts.empty() && depend_on_hc) {
for (auto i = current_hosts.begin(); i != current_hosts.end();) {
if ((*i)->healthy()) {
if (!((*i)->healthFailures() & Host::HealthFailures::ACTIVE_HC)) {
if ((*i)->weight() > max_host_weight) {
max_host_weight = (*i)->weight();
}
Expand Down
7 changes: 4 additions & 3 deletions source/common/upstream/upstream_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,9 @@ class HostImpl : public HostDescriptionImpl,
std::list<std::reference_wrapper<Stats::Gauge>> gauges() const override {
return stats_store_.gauges();
}
bool healthy() const override { return healthy_; }
void healthy(bool is_healthy) override { healthy_ = is_healthy; }
uint64_t healthFailures() const override { return health_failures_; }
void healthFailureClear(uint64_t failure) override { health_failures_ &= ~failure; }
void healthFailureSet(uint64_t failure) override { health_failures_ |= failure; }
uint32_t weight() const override { return weight_; }
void weight(uint32_t new_weight);

Expand All @@ -77,7 +78,7 @@ class HostImpl : public HostDescriptionImpl,
createConnection(Event::Dispatcher& dispatcher, const Cluster& cluster, const std::string& url);

private:
std::atomic<bool> healthy_{true};
std::atomic<uint64_t> health_failures_{};
std::atomic<uint32_t> weight_;
};

Expand Down
4 changes: 2 additions & 2 deletions source/server/http/admin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,8 +117,8 @@ Http::Code AdminImpl::handlerClusters(const std::string&, Buffer::Instance& resp
stat.first, stat.second));
}

response.add(fmt::format("{}::{}::healthy::{}\n", cluster.second->name(), host->url(),
host->healthy()));
response.add(fmt::format("{}::{}::health_failures::{}\n", cluster.second->name(), host->url(),
host->healthFailures()));
response.add(
fmt::format("{}::{}::weight::{}\n", cluster.second->name(), host->url(), host->weight()));
response.add(
Expand Down
Loading