diff --git a/.go-version b/.go-version index 229a27c6f2..3dfc2ecf95 100644 --- a/.go-version +++ b/.go-version @@ -1 +1 @@ -1.22.8 +1.22.11 diff --git a/docs/install/iam_policy.json b/docs/install/iam_policy.json index 1a5b4d614b..1d6f01b79a 100644 --- a/docs/install/iam_policy.json +++ b/docs/install/iam_policy.json @@ -42,7 +42,8 @@ "elasticloadbalancing:DescribeTags", "elasticloadbalancing:DescribeTrustStores", "elasticloadbalancing:DescribeListenerAttributes", - "elasticloadbalancing:DescribeCapacityReservation" + "elasticloadbalancing:DescribeCapacityReservation", + "tag:GetResources" ], "Resource": "*" }, diff --git a/docs/install/iam_policy_cn.json b/docs/install/iam_policy_cn.json index ba8a39fa79..85e4c13ead 100644 --- a/docs/install/iam_policy_cn.json +++ b/docs/install/iam_policy_cn.json @@ -42,7 +42,8 @@ "elasticloadbalancing:DescribeTags", "elasticloadbalancing:DescribeTrustStores", "elasticloadbalancing:DescribeListenerAttributes", - "elasticloadbalancing:DescribeCapacityReservation" + "elasticloadbalancing:DescribeCapacityReservation", + "tag:GetResources" ], "Resource": "*" }, diff --git a/docs/install/iam_policy_iso.json b/docs/install/iam_policy_iso.json index bd87af7f5e..2cd6080863 100644 --- a/docs/install/iam_policy_iso.json +++ b/docs/install/iam_policy_iso.json @@ -39,7 +39,8 @@ "elasticloadbalancing:DescribeTargetGroups", "elasticloadbalancing:DescribeTargetGroupAttributes", "elasticloadbalancing:DescribeTargetHealth", - "elasticloadbalancing:DescribeTags" + "elasticloadbalancing:DescribeTags", + "tag:GetResources" ], "Resource": "*" }, diff --git a/docs/install/iam_policy_isob.json b/docs/install/iam_policy_isob.json index e552c458fe..112c3fb644 100644 --- a/docs/install/iam_policy_isob.json +++ b/docs/install/iam_policy_isob.json @@ -39,7 +39,8 @@ "elasticloadbalancing:DescribeTargetGroups", "elasticloadbalancing:DescribeTargetGroupAttributes", "elasticloadbalancing:DescribeTargetHealth", - "elasticloadbalancing:DescribeTags" + "elasticloadbalancing:DescribeTags", + "tag:GetResources" ], "Resource": "*" }, diff --git a/docs/install/iam_policy_isoe.json b/docs/install/iam_policy_isoe.json index 9afb8d4fab..fb337e4a05 100644 --- a/docs/install/iam_policy_isoe.json +++ b/docs/install/iam_policy_isoe.json @@ -39,7 +39,8 @@ "elasticloadbalancing:DescribeTargetGroups", "elasticloadbalancing:DescribeTargetGroupAttributes", "elasticloadbalancing:DescribeTargetHealth", - "elasticloadbalancing:DescribeTags" + "elasticloadbalancing:DescribeTags", + "tag:GetResources" ], "Resource": "*" }, diff --git a/docs/install/iam_policy_isof.json b/docs/install/iam_policy_isof.json index 2c5054393b..2ef5f66178 100644 --- a/docs/install/iam_policy_isof.json +++ b/docs/install/iam_policy_isof.json @@ -39,7 +39,8 @@ "elasticloadbalancing:DescribeTargetGroups", "elasticloadbalancing:DescribeTargetGroupAttributes", "elasticloadbalancing:DescribeTargetHealth", - "elasticloadbalancing:DescribeTags" + "elasticloadbalancing:DescribeTags", + "tag:GetResources" ], "Resource": "*" }, diff --git a/docs/install/iam_policy_us-gov.json b/docs/install/iam_policy_us-gov.json index 828f77f4d8..9d665a1e65 100644 --- a/docs/install/iam_policy_us-gov.json +++ b/docs/install/iam_policy_us-gov.json @@ -42,7 +42,8 @@ "elasticloadbalancing:DescribeTags", "elasticloadbalancing:DescribeTrustStores", "elasticloadbalancing:DescribeListenerAttributes", - "elasticloadbalancing:DescribeCapacityReservation" + "elasticloadbalancing:DescribeCapacityReservation", + "tag:GetResources" ], "Resource": "*" }, diff --git a/main.go b/main.go index c4641fb109..07b79a1ea1 100644 --- a/main.go +++ b/main.go @@ -17,11 +17,6 @@ limitations under the License. package main import ( - "k8s.io/client-go/util/workqueue" - "os" - - elbv2deploy "sigs.k8s.io/aws-load-balancer-controller/pkg/deploy/elbv2" - "github.com/go-logr/logr" "github.com/spf13/pflag" zapraw "go.uber.org/zap" @@ -29,7 +24,9 @@ import ( "k8s.io/client-go/kubernetes" clientgoscheme "k8s.io/client-go/kubernetes/scheme" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" + "k8s.io/client-go/util/workqueue" "k8s.io/klog/v2" + "os" elbv2api "sigs.k8s.io/aws-load-balancer-controller/apis/elbv2/v1beta1" elbv2controller "sigs.k8s.io/aws-load-balancer-controller/controllers/elbv2" "sigs.k8s.io/aws-load-balancer-controller/controllers/ingress" @@ -37,6 +34,7 @@ import ( "sigs.k8s.io/aws-load-balancer-controller/pkg/aws" "sigs.k8s.io/aws-load-balancer-controller/pkg/aws/throttle" "sigs.k8s.io/aws-load-balancer-controller/pkg/config" + elbv2deploy "sigs.k8s.io/aws-load-balancer-controller/pkg/deploy/elbv2" "sigs.k8s.io/aws-load-balancer-controller/pkg/inject" "sigs.k8s.io/aws-load-balancer-controller/pkg/k8s" awsmetrics "sigs.k8s.io/aws-load-balancer-controller/pkg/metrics/aws" @@ -52,6 +50,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/metrics" + "time" // +kubebuilder:scaffold:imports ) @@ -84,8 +83,6 @@ func main() { klog.SetLoggerWithOptions(appLogger, klog.ContextualLogger(true)) var awsMetricsCollector *awsmetrics.Collector - lbcMetricsCollector := lbcmetrics.NewCollector(metrics.Registry) - if metrics.Registry != nil { awsMetricsCollector = awsmetrics.NewCollector(metrics.Registry) } @@ -107,6 +104,17 @@ func main() { os.Exit(1) } + // track the k8s resources with finalizers contains "k8s.aws" + // track the aws resources with cluster tag "elbv2.k8s.aws/cluster=$ClusterName" + lbcMetricsCollector := lbcmetrics.NewCollector( + metrics.Registry, + mgr.GetClient(), + cloud.RGT(), + "k8s.aws", + "elbv2.k8s.aws/cluster", + controllerCFG.ClusterName, + ) + clientSet, err := kubernetes.NewForConfig(mgr.GetConfig()) if err != nil { setupLog.Error(err, "unable to obtain clientSet") @@ -202,6 +210,28 @@ func main() { deferredTGBQueue.Run() }() + // TODO: we can better improve this to update the metrics per reconcile + go func() { + ticker := time.NewTicker(2 * time.Minute) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + setupLog.Info("updating managed resource metrics") + if err := lbcMetricsCollector.UpdateManagedK8sResourceMetrics(ctx); err != nil { + setupLog.Error(err, "failed to update managed Kubernetes resource metrics") + } + if err := lbcMetricsCollector.UpdateManagedALBMetrics(ctx); err != nil { + setupLog.Error(err, "failed to update managed ALB metrics") + } + if err := lbcMetricsCollector.UpdateManagedNLBMetrics(ctx); err != nil { + setupLog.Error(err, "failed to update managed NLB metrics") + } + } + } + }() + if err := podInfoRepo.WaitForCacheSync(ctx); err != nil { setupLog.Error(err, "problem wait for podInfo repo sync") os.Exit(1) diff --git a/pkg/metrics/aws/instruments.go b/pkg/metrics/aws/instruments.go index 150bf11019..b5c945118b 100644 --- a/pkg/metrics/aws/instruments.go +++ b/pkg/metrics/aws/instruments.go @@ -59,8 +59,13 @@ func newInstruments(registerer prometheus.Registerer) *instruments { Name: metricAPIRequestDurationSeconds, Help: "Latency of an individual HTTP request to the service endpoint", }, []string{labelService, labelOperation}) - - registerer.MustRegister(apiCallsTotal, apiCallDurationSeconds, apiCallRetries, apiRequestsTotal, apiRequestDurationSecond) + registerer.MustRegister( + apiCallsTotal, + apiCallDurationSeconds, + apiCallRetries, + apiRequestsTotal, + apiRequestDurationSecond, + ) return &instruments{ apiCallsTotal: apiCallsTotal, apiCallDurationSeconds: apiCallDurationSeconds, diff --git a/pkg/metrics/lbc/collector.go b/pkg/metrics/lbc/collector.go index 34da128486..eb6bd43a2e 100644 --- a/pkg/metrics/lbc/collector.go +++ b/pkg/metrics/lbc/collector.go @@ -1,18 +1,49 @@ package lbc import ( + "context" + awssdk "github.com/aws/aws-sdk-go-v2/aws" + rgtsdk "github.com/aws/aws-sdk-go-v2/service/resourcegroupstaggingapi" + rgttypes "github.com/aws/aws-sdk-go-v2/service/resourcegroupstaggingapi/types" "github.com/prometheus/client_golang/prometheus" + corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" + elbv2api "sigs.k8s.io/aws-load-balancer-controller/apis/elbv2/v1beta1" + "sigs.k8s.io/aws-load-balancer-controller/pkg/aws/services" + "strings" + + "sigs.k8s.io/controller-runtime/pkg/client" "time" ) +const ( + networkLoadBalancerStr = "nlb" + resourceTypeALB = "elasticloadbalancing:loadbalancer/app" + resourceTypeNLB = "elasticloadbalancing:loadbalancer/net" +) + type MetricCollector interface { // ObservePodReadinessGateReady this metric is useful to determine how fast pods are becoming ready in the load balancer. // Due to some architectural constraints, we can only emit this metric for pods that are using readiness gates. ObservePodReadinessGateReady(namespace string, tgbName string, duration time.Duration) + + // UpdateManagedK8sResourceMetrics fetches and updates managed k8s resources metrics. + UpdateManagedK8sResourceMetrics(ctx context.Context) error + + // UpdateManagedALBMetrics updates managed ALB count metrics + UpdateManagedALBMetrics(ctx context.Context) error + + //UpdateManagedNLBMetrics updates managed NLB count metrics + UpdateManagedNLBMetrics(ctx context.Context) error } type collector struct { - instruments *instruments + instruments *instruments + runtimeClient client.Client + rgt services.RGT + finalizerKeyWord string + clusterTagKey string + clusterTagVal string } type noOpCollector struct{} @@ -20,14 +51,31 @@ type noOpCollector struct{} func (n *noOpCollector) ObservePodReadinessGateReady(_ string, _ string, _ time.Duration) { } -func NewCollector(registerer prometheus.Registerer) MetricCollector { - if registerer == nil { +func (n *noOpCollector) UpdateManagedK8sResourceMetrics(_ context.Context) error { + return nil +} + +func (n *noOpCollector) UpdateManagedALBMetrics(_ context.Context) error { + return nil +} + +func (n *noOpCollector) UpdateManagedNLBMetrics(_ context.Context) error { + return nil +} + +func NewCollector(registerer prometheus.Registerer, runtimeClient client.Client, rgt services.RGT, finalizerKeyWord string, clusterTagKey string, clusterTagVal string) MetricCollector { + if registerer == nil || runtimeClient == nil { return &noOpCollector{} } instruments := newInstruments(registerer) return &collector{ - instruments: instruments, + instruments: instruments, + runtimeClient: runtimeClient, + rgt: rgt, + finalizerKeyWord: finalizerKeyWord, + clusterTagKey: clusterTagKey, + clusterTagVal: clusterTagVal, } } @@ -37,3 +85,99 @@ func (c *collector) ObservePodReadinessGateReady(namespace string, tgbName strin labelName: tgbName, }).Observe(duration.Seconds()) } + +func (c *collector) UpdateManagedK8sResourceMetrics(ctx context.Context) error { + listOpts := &client.ListOptions{ + Namespace: "", + } + ingressCount, serviceCount, tgbCount := 0, 0, 0 + // Fetch ingress count + ingressList := &networkingv1.IngressList{} + err := c.runtimeClient.List(ctx, ingressList, listOpts) + if err != nil { + return err + } + for _, ingress := range ingressList.Items { + for _, finalizer := range ingress.Finalizers { + if strings.Contains(finalizer, c.finalizerKeyWord) { + ingressCount++ + break + } + } + } + c.instruments.managedIngressCount.Set(float64(ingressCount)) + + // Fetch service count + serviceList := &corev1.ServiceList{} + err = c.runtimeClient.List(ctx, serviceList, listOpts) + if err != nil { + return err + } + for _, service := range serviceList.Items { + hasMatchingFinalizer := false + for _, finalizer := range service.Finalizers { + if strings.Contains(finalizer, c.finalizerKeyWord) { + hasMatchingFinalizer = true + break + } + } + + if hasMatchingFinalizer && service.Spec.LoadBalancerClass != nil && strings.Contains(*service.Spec.LoadBalancerClass, networkLoadBalancerStr) { + serviceCount++ + } + } + c.instruments.managedServiceCount.Set(float64(serviceCount)) + + // Fetch TargetGroupBinding count + tgbList := &elbv2api.TargetGroupBindingList{} + err = c.runtimeClient.List(ctx, tgbList, listOpts) + if err != nil { + return err + } + for _, tgb := range tgbList.Items { + for _, finalizer := range tgb.Finalizers { + if strings.Contains(finalizer, c.finalizerKeyWord) { + tgbCount++ + break + } + } + } + c.instruments.managedTGBCount.Set(float64(tgbCount)) + + return nil +} + +func (c *collector) UpdateManagedALBMetrics(ctx context.Context) error { + count, err := c.getManagedAWSResourceMetrics(ctx, resourceTypeALB) + if err != nil { + return err + } + c.instruments.managedALBCount.Set(float64(count)) + return nil +} + +func (c *collector) UpdateManagedNLBMetrics(ctx context.Context) error { + count, err := c.getManagedAWSResourceMetrics(ctx, resourceTypeNLB) + if err != nil { + return err + } + c.instruments.managedNLBCount.Set(float64(count)) + return nil +} + +func (c *collector) getManagedAWSResourceMetrics(ctx context.Context, resourceType string) (count int, err error) { + req := &rgtsdk.GetResourcesInput{ + ResourceTypeFilters: []string{resourceType}, + TagFilters: []rgttypes.TagFilter{ + { + Key: awssdk.String(c.clusterTagKey), + Values: []string{c.clusterTagVal}, + }, + }, + } + resources, err := c.rgt.GetResourcesAsList(ctx, req) + if err != nil { + return 0, err + } + return len(resources), nil +} diff --git a/pkg/metrics/lbc/instruments.go b/pkg/metrics/lbc/instruments.go index 8fafd2f987..a81963d298 100644 --- a/pkg/metrics/lbc/instruments.go +++ b/pkg/metrics/lbc/instruments.go @@ -21,6 +21,11 @@ const ( type instruments struct { podReadinessFlipSeconds *prometheus.HistogramVec + managedIngressCount prometheus.Gauge + managedServiceCount prometheus.Gauge + managedTGBCount prometheus.Gauge + managedALBCount prometheus.Gauge + managedNLBCount prometheus.Gauge } // newInstruments allocates and register new metrics to registerer @@ -31,9 +36,40 @@ func newInstruments(registerer prometheus.Registerer) *instruments { Help: "Latency from pod getting added to the load balancer until the readiness gate is flipped to healthy.", Buckets: []float64{10, 30, 60, 120, 180, 240, 300, 360, 420, 480, 540, 600}, }, []string{labelNamespace, labelName}) - - registerer.MustRegister(podReadinessFlipSeconds) + managedIngressCount := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "lb_controller_managed_ingress_count", + Help: "Number of ingresses managed by the AWS Load Balancer Controller.", + }) + managedServiceCount := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "lb_controller_managed_service_count", + Help: "Number of service type Load Balancers (NLBs) managed by the AWS Load Balancer Controller.", + }) + managedTGBCount := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "lb_controller_managed_targetgroupbinding_count", + Help: "Number of targetgroupbindings managed by the AWS Load Balancer Controller.", + }) + managedALBCount := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "lb_controller_managed_albs_total", + Help: "Current number of ALBs managed by the controller", + }) + managedNLBCount := prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "lb_controller_managed_nlbs_total", + Help: "Current number of NLBs managed by the controller", + }) + registerer.MustRegister( + podReadinessFlipSeconds, + managedIngressCount, + managedServiceCount, + managedTGBCount, + managedALBCount, + managedNLBCount, + ) return &instruments{ podReadinessFlipSeconds: podReadinessFlipSeconds, + managedIngressCount: managedIngressCount, + managedServiceCount: managedServiceCount, + managedTGBCount: managedTGBCount, + managedALBCount: managedALBCount, + managedNLBCount: managedNLBCount, } } diff --git a/pkg/metrics/lbc/mockcollector.go b/pkg/metrics/lbc/mockcollector.go index 9c8fb6a43a..84d5e74bcc 100644 --- a/pkg/metrics/lbc/mockcollector.go +++ b/pkg/metrics/lbc/mockcollector.go @@ -1,6 +1,7 @@ package lbc import ( + "context" "time" ) @@ -14,22 +15,53 @@ type MockHistogramMetric struct { duration time.Duration } +// ObservePodReadinessGateReady mocks observing the readiness gate latency. func (m *MockCollector) ObservePodReadinessGateReady(namespace string, tgbName string, d time.Duration) { m.recordHistogram(MetricPodReadinessGateReady, namespace, tgbName, d) } +// UpdateManagedK8sResourceMetrics mocks updating managed Kubernetes resource metrics. +func (m *MockCollector) UpdateManagedK8sResourceMetrics(ctx context.Context) error { + m.recordInvocation("UpdateManagedK8sResourceMetrics", ctx) + return nil // No-op for the mock +} + +// UpdateManagedALBMetrics mocks updating managed ALB resource metrics. +func (m *MockCollector) UpdateManagedALBMetrics(ctx context.Context) error { + m.recordInvocation("UpdateManagedALBMetrics", ctx) + return nil // No-op for the mock +} + +// UpdateManagedNLBMetrics mocks updating managed ALB resource metrics. +func (m *MockCollector) UpdateManagedNLBMetrics(ctx context.Context) error { + m.recordInvocation("UpdateManagedALBMetrics", ctx) + return nil // No-op for the mock +} + +// recordHistogram adds a histogram metric invocation. func (m *MockCollector) recordHistogram(metricName string, namespace string, name string, d time.Duration) { - m.Invocations[metricName] = append(m.Invocations[MetricPodReadinessGateReady], MockHistogramMetric{ + if _, exists := m.Invocations[metricName]; !exists { + m.Invocations[metricName] = []interface{}{} + } + m.Invocations[metricName] = append(m.Invocations[metricName], MockHistogramMetric{ namespace: namespace, name: name, duration: d, }) } -func NewMockCollector() MetricCollector { +// recordInvocation tracks a method invocation with arguments. +func (m *MockCollector) recordInvocation(methodName string, args ...interface{}) { + if _, exists := m.Invocations[methodName]; !exists { + m.Invocations[methodName] = []interface{}{} + } + m.Invocations[methodName] = append(m.Invocations[methodName], args) +} +// NewMockCollector creates and returns a new MockCollector. +func NewMockCollector() MetricCollector { mockInvocations := make(map[string][]interface{}) - mockInvocations[MetricPodReadinessGateReady] = make([]interface{}, 0) + mockInvocations[MetricPodReadinessGateReady] = []interface{}{} return &MockCollector{ Invocations: mockInvocations,