From 5cfb124ff45c4f38a79caf0a5cd97ada8b7b74cf Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Mon, 15 Jul 2024 19:26:30 +0200 Subject: [PATCH 1/3] Make ClusterResourceSet controller more predictable --- .../clusterresourceset_controller.go | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/exp/addons/internal/controllers/clusterresourceset_controller.go b/exp/addons/internal/controllers/clusterresourceset_controller.go index 51cfdc4cc535..f796ec2319aa 100644 --- a/exp/addons/internal/controllers/clusterresourceset_controller.go +++ b/exp/addons/internal/controllers/clusterresourceset_controller.go @@ -176,6 +176,27 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R // Return an aggregated error if errors occurred. if len(errs) > 0 { + // When there are more than one ClusterResourceSet targeting the same cluster, + // there might be conflict when reconciling those ClusterResourceSet in parallel because they all try to + // patch the same ClusterResourceSetBinding Object. + // In case of patching conflicts we don't want to go on exponential backlog, otherwise it might take an + // arbitrary long time to get to stable state due to the backoff delay quickly growing. + // Instead, we are requeing with an interval to make the system a little bit more predictable (and stabilize tests). + // NOTE: The fact that we rely on conflict errors + requeue to reach the stable state isn't ideal, and + // it might also become an issue at scale. + // e.g. From an empirical observation, it takes 20s for 10 ClusterResourceSet to get to a stable state + // on the same ClusterResourceSetBinding; with less ClusterResourceSet the issue is less relevant + // (e.g. with 5 ClusterResourceSet it takes about 4 seconds). + // NOTE: Conflicts happens mostly when ClusterResourceSetBinding is initialized / an entry is added for each + // cluster resource set targeting the same cluster. 
+ for _, err := range errs { + if aggregate, ok := err.(kerrors.Aggregate); ok { + if len(aggregate.Errors()) == 1 && apierrors.IsConflict(aggregate.Errors()[0]) { + log.Info("Conflict in patching a ClusterResourceSetBinding that is updated by more than one ClusterResourceSet, requeing") + return ctrl.Result{RequeueAfter: 100 * time.Millisecond}, nil + } + } + } return ctrl.Result{}, kerrors.NewAggregate(errs) } From e1d0a71c149ea0cc19533917b7cf5f4bee580ec9 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Tue, 16 Jul 2024 17:18:34 +0200 Subject: [PATCH 2/3] Improve TestClusterResourceSetReconciler --- .../clusterresourceset_controller_test.go | 51 +++++++++++-------- exp/addons/internal/controllers/suite_test.go | 7 +-- internal/test/envtest/environment.go | 2 +- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/exp/addons/internal/controllers/clusterresourceset_controller_test.go b/exp/addons/internal/controllers/clusterresourceset_controller_test.go index 76a7400dd977..ec982e9128b5 100644 --- a/exp/addons/internal/controllers/clusterresourceset_controller_test.go +++ b/exp/addons/internal/controllers/clusterresourceset_controller_test.go @@ -56,23 +56,7 @@ func TestClusterResourceSetReconciler(t *testing.T) { resourceConfigMapsNamespace = "default" ) - setup := func(t *testing.T, g *WithT) *corev1.Namespace { - t.Helper() - - clusterResourceSetName = fmt.Sprintf("clusterresourceset-%s", util.RandomString(6)) - labels = map[string]string{clusterResourceSetName: "bar"} - - ns, err := env.CreateNamespace(ctx, namespacePrefix) - g.Expect(err).ToNot(HaveOccurred()) - - clusterName = fmt.Sprintf("cluster-%s", util.RandomString(6)) - testCluster = &clusterv1.Cluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns.Name}} - - t.Log("Creating the Cluster") - g.Expect(env.Create(ctx, testCluster)).To(Succeed()) - t.Log("Creating the remote Cluster kubeconfig") - g.Expect(env.CreateKubeconfigSecret(ctx, testCluster)).To(Succeed()) - + createConfigMapAndSecret := func(g Gomega, namespaceName, configmapName, secretName string) { resourceConfigMap1Content := fmt.Sprintf(`metadata: name: %s namespace: %s @@ -82,7 +66,7 @@ apiVersion: v1`, resourceConfigMap1Name, resourceConfigMapsNamespace) testConfigmap := &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: configmapName, - Namespace: ns.Name, + Namespace: namespaceName, }, Data: map[string]string{ "cm": resourceConfigMap1Content, @@ -98,7 +82,7 @@ metadata: testSecret := &corev1.Secret{ ObjectMeta: metav1.ObjectMeta{ Name: secretName, - Namespace: ns.Name, + Namespace: namespaceName, }, Type: "addons.cluster.x-k8s.io/resource-set", StringData: map[string]string{ @@ -108,7 +92,28 @@ metadata: t.Log("Creating a Secret and a ConfigMap with ConfigMap in their data field") g.Expect(env.Create(ctx, testConfigmap)).To(Succeed()) g.Expect(env.Create(ctx, testSecret)).To(Succeed()) + } + setup := func(t *testing.T, g *WithT) *corev1.Namespace { + t.Helper() + + clusterResourceSetName = fmt.Sprintf("clusterresourceset-%s", util.RandomString(6)) + labels = map[string]string{clusterResourceSetName: "bar"} + + ns, err := env.CreateNamespace(ctx, namespacePrefix) + g.Expect(err).ToNot(HaveOccurred()) + + clusterName = fmt.Sprintf("cluster-%s", util.RandomString(6)) + testCluster = &clusterv1.Cluster{ObjectMeta: metav1.ObjectMeta{Name: clusterName, Namespace: ns.Name}} + + t.Log("Creating the Cluster") + g.Expect(env.CreateAndWait(ctx, testCluster)).To(Succeed()) + t.Log("Creating the remote Cluster kubeconfig") + 
g.Expect(env.CreateKubeconfigSecret(ctx, testCluster)).To(Succeed()) + _, err = tracker.GetClient(ctx, client.ObjectKeyFromObject(testCluster)) + g.Expect(err).ToNot(HaveOccurred()) + + createConfigMapAndSecret(g, ns.Name, configmapName, secretName) return ns } @@ -1031,10 +1036,14 @@ metadata: defer teardown(t, g, ns) t.Log("Creating ClusterResourceSet instances that have same labels as selector") - for range 10 { + for i := range 10 { + configmapName := fmt.Sprintf("%s-%d", configmapName, i) + secretName := fmt.Sprintf("%s-%d", secretName, i) + createConfigMapAndSecret(g, ns.Name, configmapName, secretName) + clusterResourceSetInstance := &addonsv1.ClusterResourceSet{ ObjectMeta: metav1.ObjectMeta{ - Name: fmt.Sprintf("clusterresourceset-%s", util.RandomString(6)), + Name: fmt.Sprintf("clusterresourceset-%d", i), Namespace: ns.Name, }, Spec: addonsv1.ClusterResourceSetSpec{ diff --git a/exp/addons/internal/controllers/suite_test.go b/exp/addons/internal/controllers/suite_test.go index 574fa30c0bfa..bea34b9c1ea6 100644 --- a/exp/addons/internal/controllers/suite_test.go +++ b/exp/addons/internal/controllers/suite_test.go @@ -38,8 +38,9 @@ import ( ) var ( - env *envtest.Environment - ctx = ctrl.SetupSignalHandler() + env *envtest.Environment + tracker *remote.ClusterCacheTracker + ctx = ctrl.SetupSignalHandler() ) func TestMain(m *testing.M) { @@ -76,7 +77,7 @@ func TestMain(m *testing.M) { panic(fmt.Sprintf("Failed to start cache for metadata only Secret watches: %v", err)) } - tracker, err := remote.NewClusterCacheTracker(mgr, remote.ClusterCacheTrackerOptions{}) + tracker, err = remote.NewClusterCacheTracker(mgr, remote.ClusterCacheTrackerOptions{}) if err != nil { panic(fmt.Sprintf("Failed to create new cluster cache tracker: %v", err)) } diff --git a/internal/test/envtest/environment.go b/internal/test/envtest/environment.go index 6e33d91f76ff..2727ff1f7b68 100644 --- a/internal/test/envtest/environment.go +++ b/internal/test/envtest/environment.go @@ -383,7 +383,7 @@ func (e *Environment) waitForWebhooks() { // CreateKubeconfigSecret generates a new Kubeconfig secret from the envtest config. func (e *Environment) CreateKubeconfigSecret(ctx context.Context, cluster *clusterv1.Cluster) error { - return e.Create(ctx, kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(e.Config, cluster))) + return e.CreateAndWait(ctx, kubeconfig.GenerateSecret(cluster, kubeconfig.FromEnvTestConfig(e.Config, cluster))) } // Cleanup deletes all the given objects. From e68d8e745b66d0d3b2da27169dc86d5f57709d89 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Tue, 16 Jul 2024 18:03:32 +0200 Subject: [PATCH 3/3] Address comments --- .../controllers/clusterresourceset_controller.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/exp/addons/internal/controllers/clusterresourceset_controller.go b/exp/addons/internal/controllers/clusterresourceset_controller.go index f796ec2319aa..9f5cac203335 100644 --- a/exp/addons/internal/controllers/clusterresourceset_controller.go +++ b/exp/addons/internal/controllers/clusterresourceset_controller.go @@ -179,20 +179,15 @@ func (r *ClusterResourceSetReconciler) Reconcile(ctx context.Context, req ctrl.R // When there are more than one ClusterResourceSet targeting the same cluster, // there might be conflict when reconciling those ClusterResourceSet in parallel because they all try to // patch the same ClusterResourceSetBinding Object. 
- // In case of patching conflicts we don't want to go on exponential backlog, otherwise it might take an + // In case of patching conflicts we don't want to go on exponential backoff, otherwise it might take an // arbitrary long time to get to stable state due to the backoff delay quickly growing. - // Instead, we are requeing with an interval to make the system a little bit more predictable (and stabilize tests). - // NOTE: The fact that we rely on conflict errors + requeue to reach the stable state isn't ideal, and - // it might also become an issue at scale. - // e.g. From an empirical observation, it takes 20s for 10 ClusterResourceSet to get to a stable state - // on the same ClusterResourceSetBinding; with less ClusterResourceSet the issue is less relevant - // (e.g. with 5 ClusterResourceSet it takes about 4 seconds). + // Instead, we are requeueing with an interval to make the system a little bit more predictable (and stabilize tests). // NOTE: Conflicts happens mostly when ClusterResourceSetBinding is initialized / an entry is added for each // cluster resource set targeting the same cluster. for _, err := range errs { if aggregate, ok := err.(kerrors.Aggregate); ok { if len(aggregate.Errors()) == 1 && apierrors.IsConflict(aggregate.Errors()[0]) { - log.Info("Conflict in patching a ClusterResourceSetBinding that is updated by more than one ClusterResourceSet, requeing") + log.Info("Conflict in patching a ClusterResourceSetBinding that is updated by more than one ClusterResourceSet, requeueing") return ctrl.Result{RequeueAfter: 100 * time.Millisecond}, nil } }
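For reference, the requeue-on-conflict handling introduced by this series boils down to the pattern sketched below. This is a minimal, illustrative example, not the actual Cluster API controller code: the helper name requeueOnConflict and the main driver are invented for the sketch, and error collection is simplified compared to the real Reconcile loop.

// Minimal sketch of the requeue-on-conflict pattern, assuming errors are collected
// per cluster and each entry may be an aggregate produced by a patch attempt.
package main

import (
	"errors"
	"fmt"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime/schema"
	kerrors "k8s.io/apimachinery/pkg/util/errors"
	ctrl "sigs.k8s.io/controller-runtime"
)

// requeueOnConflict requeues after a fixed, short interval when the only failure is an
// optimistic-concurrency conflict; otherwise it returns the aggregated error, which
// controller-runtime retries with exponential backoff.
func requeueOnConflict(errs []error) (ctrl.Result, error) {
	if len(errs) == 0 {
		return ctrl.Result{}, nil
	}
	for _, err := range errs {
		if aggregate, ok := err.(kerrors.Aggregate); ok &&
			len(aggregate.Errors()) == 1 && apierrors.IsConflict(aggregate.Errors()[0]) {
			// Conflicts are expected when several ClusterResourceSets patch the same
			// ClusterResourceSetBinding; a constant delay keeps convergence predictable.
			return ctrl.Result{RequeueAfter: 100 * time.Millisecond}, nil
		}
	}
	return ctrl.Result{}, kerrors.NewAggregate(errs)
}

func main() {
	// Simulate the error shape produced when a patch hits a conflict.
	conflict := apierrors.NewConflict(
		schema.GroupResource{Group: "addons.cluster.x-k8s.io", Resource: "clusterresourcesetbindings"},
		"test-cluster",
		errors.New("the object has been modified; please apply your changes to the latest version"),
	)
	res, err := requeueOnConflict([]error{kerrors.NewAggregate([]error{conflict})})
	fmt.Printf("result=%+v err=%v\n", res, err)
}

The design trade-off is the one described in the patch comment: a fixed 100ms requeue keeps the time to reach a stable state roughly proportional to the number of ClusterResourceSets contending for the same binding, whereas exponential backoff would make it grow unpredictably.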