From eec5e1471007f51baddff4c96654a32cef7af2e9 Mon Sep 17 00:00:00 2001 From: Sven Krieger <37476281+svkrieger@users.noreply.github.com> Date: Tue, 8 Jul 2025 14:59:39 +0200 Subject: [PATCH] Add status endpoint to separate webserver The /internal/v4/status endpoint gives more fine-grained insight into the health state of the CC. When at least one Puma worker is idle, it returns OK. If all workers are occupied, it returns BUSY. If all workers are busy and no requests have been processed for 60s, it returns UNHEALTHY. This endpoint will be used to prevent restarts while the CC is still healthy and merely working off peak loads. --- lib/cloud_controller/runner.rb | 55 ++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/lib/cloud_controller/runner.rb b/lib/cloud_controller/runner.rb index 7c8040be52..409d235240 100644 --- a/lib/cloud_controller/runner.rb +++ b/lib/cloud_controller/runner.rb @@ -136,12 +136,17 @@ def setup_metrics setup_metrics_webserver end - # The webserver runs in the main process and serves only the metrics endpoint. - # This makes it possible to retrieve metrics even if all Puma workers of the main app are busy. + # The webserver runs in the main process and serves only the metrics and status endpoints. + # This makes it possible to retrieve both even if all Puma workers of the main app are busy. 
def setup_metrics_webserver + readiness_status_proc = method(:status) metrics_app = Rack::Builder.new do use Prometheus::Middleware::Exporter, path: '/internal/v4/metrics' + map '/internal/v4/status' do + run ->(_env) { readiness_status_proc.call } + end + map '/' do run lambda { |_env| # Return 404 for any other request @@ -163,6 +168,52 @@ def setup_metrics_webserver end end + # Persist state for status endpoint + @previous_requests_count_sum = nil + @last_requests_count_increase_time = nil + + def status + stats = Puma.stats_hash + worker_statuses = stats[:worker_status] + all_busy = all_workers_busy?(worker_statuses) + current_requests_count_sum = worker_requests_count_sum(worker_statuses) + + now = Time.now + prev = @previous_requests_count_sum + + # Track when requests_count_sum increases + @last_requests_count_increase_time = now if prev.nil? || current_requests_count_sum > prev + @previous_requests_count_sum = current_requests_count_sum + + unhealthy = false + if all_busy && @last_requests_count_increase_time && (now - @last_requests_count_increase_time) > 60 + # If requests_count_sum hasn't increased in 60 seconds, unhealthy + unhealthy = true + end + + if all_busy && unhealthy + [503, { 'Content-Type' => 'text/plain' }, ['UNHEALTHY']] + elsif all_busy + [429, { 'Content-Type' => 'text/plain' }, ['BUSY']] + else + [200, { 'Content-Type' => 'text/plain' }, ['OK']] + end + rescue StandardError => e + [500, { 'Content-Type' => 'text/plain' }, ["Readiness check error: #{e}"]] + end + + def all_workers_busy?(worker_statuses) + worker_statuses.all? do |worker| + worker[:last_status][:busy_threads] == worker[:last_status][:running] + end + end + + def worker_requests_count_sum(worker_statuses) + worker_statuses.sum do |worker| + worker[:last_status][:requests_count] || 0 + end + end + def setup_logging return if @setup_logging