From 9b9194035b3b47152168ba7ef086abd9855a04ad Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 8 Jul 2025 03:41:17 +0000 Subject: [PATCH 01/12] [tmpnet] Source tmpnet defaults from a configmap Previously the scheduling label and value required to enable exclusive scheduling were defined as flag defaults. To enable the cluster to define these defaults, the defaults are now sourced from a configmap in the target namespace. --- tests/fixture/tmpnet/flags/kube_runtime.go | 14 ++---- tests/fixture/tmpnet/kube_runtime.go | 53 ++++++++++++++++++++++ tests/fixture/tmpnet/network.go | 8 +++- tests/fixture/tmpnet/network_test.go | 2 +- 4 files changed, 65 insertions(+), 12 deletions(-) diff --git a/tests/fixture/tmpnet/flags/kube_runtime.go b/tests/fixture/tmpnet/flags/kube_runtime.go index cc5304cb7952..3eee81ea99d9 100644 --- a/tests/fixture/tmpnet/flags/kube_runtime.go +++ b/tests/fixture/tmpnet/flags/kube_runtime.go @@ -20,10 +20,9 @@ const ( ) var ( - errKubeNamespaceRequired = errors.New("--kube-namespace is required") - errKubeImageRequired = errors.New("--kube-image is required") - errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) - errKubeSchedulingLabelRequired = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when --kube-use-exclusive-scheduling is enabled") + errKubeNamespaceRequired = errors.New("--kube-namespace is required") + errKubeImageRequired = errors.New("--kube-image is required") + errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) ) type kubeRuntimeVars struct { @@ -77,13 +76,13 @@ func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[ui stringVar( &v.schedulingLabelKey, "kube-scheduling-label-key", - "purpose", + "", kubeDocPrefix+"The label key to use for exclusive scheduling for node selection and toleration", ) stringVar( &v.schedulingLabelValue, "kube-scheduling-label-value", - "higher-spec", + "", kubeDocPrefix+"The label value to use for exclusive scheduling for node selection and toleration", ) } @@ -98,9 +97,6 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB { return nil, errKubeMinVolumeSizeRequired } - if v.useExclusiveScheduling && (len(v.schedulingLabelKey) == 0 || len(v.schedulingLabelValue) == 0) { - return nil, errKubeSchedulingLabelRequired - } return &tmpnet.KubeRuntimeConfig{ ConfigPath: v.config.Path, ConfigContext: v.config.Context, diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index c3bb4e72e4ab..c6fb229e20ca 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -21,6 +21,7 @@ import ( "github.com/ava-labs/avalanchego/config" "github.com/ava-labs/avalanchego/ids" + "github.com/ava-labs/avalanchego/utils/logging" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -49,8 +50,13 @@ const ( // are never scheduled to the same nodes. 
antiAffinityLabelKey = "tmpnet-scheduling" antiAffinityLabelValue = "exclusive" + + // Name of config map containing tmpnet defaults + kubeRuntimeConfigMapName = "tmpnet" ) +var errMissingSchedulingLabels = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when exclusive scheduling is enabled") + type KubeRuntimeConfig struct { // Path to the kubeconfig file identifying the target cluster ConfigPath string `json:"configPath,omitempty"` @@ -72,6 +78,53 @@ type KubeRuntimeConfig struct { SchedulingLabelValue string `json:"schedulingLabelValue,omitempty"` } +// ensureDefaults sets cluster-specific defaults for fields not already set by flags. +func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logger) error { + // Only source defaults from the cluster if exclusive scheduling is enabled + if !c.UseExclusiveScheduling { + return nil + } + + clientset, err := GetClientset(log, c.ConfigPath, c.ConfigContext) + if err != nil { + return err + } + + log.Info("attempting to retrieve configmap containing tmpnet defaults", + zap.String("namespace", c.Namespace), + zap.String("configMap", kubeRuntimeConfigMapName), + ) + + configMap, err := clientset.CoreV1().ConfigMaps(c.Namespace).Get(ctx, kubeRuntimeConfigMapName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get ConfigMap: %w", err) + } + + var ( + schedulingLabelKey = configMap.Data["defaultSchedulingLabelKey"] + schedulingLabelValue = configMap.Data["defaultSchedulingLabelValue"] + ) + if len(c.SchedulingLabelKey) == 0 && len(schedulingLabelKey) > 0 { + log.Info("setting default value for SchedulingLabelKey", + zap.String("schedulingLabelKey", schedulingLabelKey), + ) + c.SchedulingLabelKey = schedulingLabelKey + } + if len(c.SchedulingLabelValue) == 0 && len(schedulingLabelValue) > 0 { + log.Info("setting default value for SchedulingLabelValue", + zap.String("schedulingLabelValue", schedulingLabelValue), + ) + c.SchedulingLabelValue = schedulingLabelValue + } + + // Validate that the scheduling labels are now set + if len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0 { + return errMissingSchedulingLabels + } + + return nil +} + type KubeRuntime struct { node *Node } diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index fd54079049f4..b41c989635fa 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -177,7 +177,7 @@ func BootstrapNewNetwork( if err := checkVMBinaries(log, network.Subnets, network.DefaultRuntimeConfig.Process); err != nil { return err } - if err := network.EnsureDefaultConfig(log); err != nil { + if err := network.EnsureDefaultConfig(ctx, log); err != nil { return err } if err := network.Create(rootNetworkDir); err != nil { @@ -234,7 +234,7 @@ func ReadNetwork(ctx context.Context, log logging.Logger, dir string) (*Network, } // Initializes a new network with default configuration. 
-func (n *Network) EnsureDefaultConfig(log logging.Logger) error { +func (n *Network) EnsureDefaultConfig(ctx context.Context, log logging.Logger) error { log.Info("preparing configuration for new network", zap.Any("runtimeConfig", n.DefaultRuntimeConfig), ) @@ -281,6 +281,10 @@ func (n *Network) EnsureDefaultConfig(log logging.Logger) error { return errMissingRuntimeConfig } + if n.DefaultRuntimeConfig.Kube != nil { + return n.DefaultRuntimeConfig.Kube.ensureDefaults(ctx, log) + } + return nil } diff --git a/tests/fixture/tmpnet/network_test.go b/tests/fixture/tmpnet/network_test.go index fa789d7e9935..c632b41186b0 100644 --- a/tests/fixture/tmpnet/network_test.go +++ b/tests/fixture/tmpnet/network_test.go @@ -26,7 +26,7 @@ func TestNetworkSerialization(t *testing.T) { network.PrimarySubnetConfig = ConfigMap{ "validatorOnly": true, } - require.NoError(network.EnsureDefaultConfig(logging.NoLog{})) + require.NoError(network.EnsureDefaultConfig(ctx, logging.NoLog{})) require.NoError(network.Create(tmpDir)) // Ensure node runtime is initialized require.NoError(network.readNodes(ctx)) From 313e0c3805c24b6723d402a0db84752c375a55bb Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 8 Jul 2025 04:32:26 +0000 Subject: [PATCH 02/12] fixup: Align with expected configmap --- tests/fixture/tmpnet/kube_runtime.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index c6fb229e20ca..b6095a46caf5 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -80,8 +80,8 @@ type KubeRuntimeConfig struct { // ensureDefaults sets cluster-specific defaults for fields not already set by flags. func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logger) error { - // Only source defaults from the cluster if exclusive scheduling is enabled - if !c.UseExclusiveScheduling { + requireSchedulingDefaults := c.UseExclusiveScheduling && (len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0) + if !requireSchedulingDefaults { return nil } @@ -101,8 +101,8 @@ func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logg } var ( - schedulingLabelKey = configMap.Data["defaultSchedulingLabelKey"] - schedulingLabelValue = configMap.Data["defaultSchedulingLabelValue"] + schedulingLabelKey = configMap.Data["schedulingLabelKey"] + schedulingLabelValue = configMap.Data["schedulingLabelValue"] ) if len(c.SchedulingLabelKey) == 0 && len(schedulingLabelKey) > 0 { log.Info("setting default value for SchedulingLabelKey", From ba934d99070ddf6b24bbf334317959cbe59f1310 Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 8 Jul 2025 05:14:45 +0000 Subject: [PATCH 03/12] fixup: Set defaults before logging --- tests/fixture/tmpnet/network.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index b41c989635fa..b990ed114393 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -235,6 +235,13 @@ func ReadNetwork(ctx context.Context, log logging.Logger, dir string) (*Network, // Initializes a new network with default configuration. 
func (n *Network) EnsureDefaultConfig(ctx context.Context, log logging.Logger) error {
+	// Populate runtime defaults before logging it
+	if n.DefaultRuntimeConfig.Kube != nil {
+		if err := n.DefaultRuntimeConfig.Kube.ensureDefaults(ctx, log); err != nil {
+			return err
+		}
+	}
+
 	log.Info("preparing configuration for new network",
 		zap.Any("runtimeConfig", n.DefaultRuntimeConfig),
 	)
@@ -281,10 +288,6 @@ func (n *Network) EnsureDefaultConfig(ctx context.Context, log logging.Logger) e
 		return errMissingRuntimeConfig
 	}
 
-	if n.DefaultRuntimeConfig.Kube != nil {
-		return n.DefaultRuntimeConfig.Kube.ensureDefaults(ctx, log)
-	}
-
 	return nil
 }

From 01f7270989b272ace1c66ef8d70ca68e165ae554 Mon Sep 17 00:00:00 2001
From: maru 
Date: Tue, 8 Jul 2025 05:33:18 +0000
Subject: [PATCH 04/12] fixup: Rename the configmap

---
 tests/fixture/tmpnet/kube_runtime.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go
index b6095a46caf5..4f5343ab84ed 100644
--- a/tests/fixture/tmpnet/kube_runtime.go
+++ b/tests/fixture/tmpnet/kube_runtime.go
@@ -52,7 +52,7 @@ const (
 	antiAffinityLabelValue = "exclusive"
 
 	// Name of config map containing tmpnet defaults
-	kubeRuntimeConfigMapName = "tmpnet"
+	defaultsConfigMapName = "tmpnet-defaults"
 )
 
 var errMissingSchedulingLabels = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when exclusive scheduling is enabled")
@@ -92,10 +92,10 @@ func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logg
 
 	log.Info("attempting to retrieve configmap containing tmpnet defaults",
 		zap.String("namespace", c.Namespace),
-		zap.String("configMap", kubeRuntimeConfigMapName),
+		zap.String("configMap", defaultsConfigMapName),
 	)
 
-	configMap, err := clientset.CoreV1().ConfigMaps(c.Namespace).Get(ctx, kubeRuntimeConfigMapName, metav1.GetOptions{})
+	configMap, err := clientset.CoreV1().ConfigMaps(c.Namespace).Get(ctx, defaultsConfigMapName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get ConfigMap: %w", err)
 	}

From bb11355bb847d227f121e36e36a4fff651c0ef42 Mon Sep 17 00:00:00 2001
From: maru 
Date: Tue, 17 Jun 2025 00:17:21 -0700
Subject: [PATCH 05/12] [tmpnet] Enable externally accessible URIs for
 kube-hosted nodes

Previously, access from outside of a kube cluster was enabled by
port-forwarding through the kube API. This approach turned out to be
incompatible with load testing because it greatly limited the rate at
which transactions could be sent. The port-forwarding is replaced with
an nginx ingress controller to minimize overhead when running outside
of a kube cluster.
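As an illustration (example values, not enforced defaults), a node
hosted in the cluster is expected to be reachable from outside at a
path-routed URL of the form:

    http://localhost:30791/networks/<network-uuid>/<node-id>/ext/health

where the base URI targets the ingress controller's NodePort and nginx
strips the /networks/<network-uuid>/<node-id> prefix before proxying
to the node's HTTP port.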
--- flake.nix | 28 +- scripts/kind-with-registry.sh | 90 ++++++ tests/antithesis/config.go | 8 +- tests/e2e/c/dynamic_fees.go | 2 +- tests/e2e/p/interchain_workflow.go | 2 +- tests/e2e/p/l1.go | 6 +- tests/e2e/p/staking_rewards.go | 4 +- tests/e2e/p/validator_sets.go | 6 +- tests/e2e/vms/xsvm.go | 8 +- tests/e2e/x/transfer/virtuous.go | 11 +- tests/fixture/e2e/env.go | 42 +-- tests/fixture/e2e/helpers.go | 24 -- tests/fixture/tmpnet/flags/kube_runtime.go | 20 +- tests/fixture/tmpnet/kube_runtime.go | 327 +++++++++++++++++++-- tests/fixture/tmpnet/monitor_kube.go | 2 +- tests/fixture/tmpnet/network.go | 16 +- tests/fixture/tmpnet/node.go | 19 +- tests/fixture/tmpnet/process_runtime.go | 7 +- tests/fixture/tmpnet/start_kind_cluster.go | 121 +++++++- tests/fixture/tmpnet/utils.go | 30 +- tests/load/c/main/main.go | 2 +- 21 files changed, 590 insertions(+), 185 deletions(-) create mode 100755 scripts/kind-with-registry.sh diff --git a/flake.nix b/flake.nix index 1bbd86dd45c5..cbd9a812dde7 100644 --- a/flake.nix +++ b/flake.nix @@ -48,7 +48,6 @@ k9s # Kubernetes TUI kind # Kubernetes-in-Docker kubernetes-helm # Helm CLI (Kubernetes package manager) - self.packages.${system}.kind-with-registry # Script installing kind configured with a local registry # Linters shellcheck @@ -65,32 +64,11 @@ # macOS-specific frameworks darwin.apple_sdk.frameworks.Security ]; - }; - }); - - # Package to install the kind-with-registry script - packages = forAllSystems ({ pkgs }: { - kind-with-registry = pkgs.stdenv.mkDerivation { - pname = "kind-with-registry"; - version = "1.0.0"; - src = pkgs.fetchurl { - url = "https://raw.githubusercontent.com/kubernetes-sigs/kind/7cb9e6be25b48a0e248097eef29d496ab1a044d0/site/static/examples/kind-with-registry.sh"; - sha256 = "0gri0x0ygcwmz8l4h6zzsvydw8rsh7qa8p5218d4hncm363i81hv"; - }; - - phases = [ "installPhase" ]; - - installPhase = '' - mkdir -p $out/bin - install -m755 $src $out/bin/kind-with-registry.sh + # Add scripts/ directory to PATH so kind-with-registry.sh is accessible + shellHook = '' + export PATH="$PWD/scripts:$PATH" ''; - - meta = with pkgs.lib; { - description = "Script to set up kind with a local registry"; - license = licenses.mit; - maintainers = with maintainers; [ "maru-ava" ]; - }; }; }); }; diff --git a/scripts/kind-with-registry.sh b/scripts/kind-with-registry.sh new file mode 100755 index 000000000000..5027432722eb --- /dev/null +++ b/scripts/kind-with-registry.sh @@ -0,0 +1,90 @@ +#!/bin/sh +# Based on https://raw.githubusercontent.com/kubernetes-sigs/kind/7cb9e6be25b48a0e248097eef29d496ab1a044d0/site/static/examples/kind-with-registry.sh +# Original work Copyright 2019 The Kubernetes Authors +# Modifications Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. +# See the file LICENSE for licensing terms. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO(marun) Migrate this script to golang +set -o errexit + +# 1. 
Create registry container unless it already exists +reg_name='kind-registry' +reg_port='5001' +if [ "$(docker inspect -f '{{.State.Running}}' "${reg_name}" 2>/dev/null || true)" != 'true' ]; then + docker run \ + -d --restart=always -p "127.0.0.1:${reg_port}:5000" --network bridge --name "${reg_name}" \ + registry:2 +fi + +# 2. Create kind cluster with containerd registry config dir enabled +# TODO: kind will eventually enable this by default and this patch will +# be unnecessary. +# +# See: +# https://github.com/kubernetes-sigs/kind/issues/2875 +# https://github.com/containerd/containerd/blob/main/docs/cri/config.md#registry-configuration +# See: https://github.com/containerd/containerd/blob/main/docs/hosts.md +cat <= %d", tmpnet.MinimumVolumeSizeGB) + errKubeNamespaceRequired = errors.New("--kube-namespace is required") + errKubeImageRequired = errors.New("--kube-image is required") + errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) + errKubeBaseAccessibleURIRequired = errors.New("--kube-base-accessible-uri is required when running outside of cluster") ) type kubeRuntimeVars struct { @@ -32,6 +34,7 @@ type kubeRuntimeVars struct { useExclusiveScheduling bool schedulingLabelKey string schedulingLabelValue string + baseAccessibleURI string config *KubeconfigVars } @@ -85,6 +88,12 @@ func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[ui "", kubeDocPrefix+"The label value to use for exclusive scheduling for node selection and toleration", ) + stringVar( + &v.baseAccessibleURI, + "kube-base-accessible-uri", + "http://localhost:30791", + kubeDocPrefix+"The base URI for constructing node URIs when running outside of the cluster hosting nodes", + ) } func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, error) { @@ -97,6 +106,9 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB { return nil, errKubeMinVolumeSizeRequired } + if !tmpnet.IsRunningInCluster() && len(v.baseAccessibleURI) == 0 { + return nil, errKubeBaseAccessibleURIRequired + } return &tmpnet.KubeRuntimeConfig{ ConfigPath: v.config.Path, ConfigContext: v.config.Context, @@ -106,5 +118,7 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err UseExclusiveScheduling: v.useExclusiveScheduling, SchedulingLabelKey: v.schedulingLabelKey, SchedulingLabelValue: v.schedulingLabelValue, + // Strip trailing slashes to simplify path composition + BaseAccessibleURI: strings.TrimRight(v.baseAccessibleURI, "/"), }, nil } diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 4f5343ab84ed..63903a191347 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -16,6 +16,7 @@ import ( "go.uber.org/zap" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" @@ -24,6 +25,7 @@ import ( "github.com/ava-labs/avalanchego/utils/logging" corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" restclient "k8s.io/client-go/rest" @@ -76,6 +78,8 @@ type KubeRuntimeConfig struct { SchedulingLabelKey string `json:"schedulingLabelKey,omitempty"` // Label value to use for exclusive scheduling for node selection and toleration SchedulingLabelValue string `json:"schedulingLabelValue,omitempty"` 
+	// Base URI for constructing node URIs when running outside of the cluster hosting nodes (e.g., "http://localhost:30791")
+	BaseAccessibleURI string `json:"baseAccessibleURI,omitempty"`
 }
 
 // ensureDefaults sets cluster-specific defaults for fields not already set by flags.
@@ -145,6 +149,11 @@ func (p *KubeRuntime) readState(ctx context.Context) error {
 		zap.String("statefulSet", statefulSetName),
 	)
 
+	// Validate that it will be possible to construct accessible URIs when running external to the kube cluster
+	if !IsRunningInCluster() && len(runtimeConfig.BaseAccessibleURI) == 0 {
+		return errors.New("BaseAccessibleURI must be set when running outside of the kubernetes cluster")
+	}
+
 	clientset, err := p.getClientset()
 	if err != nil {
 		return err
@@ -187,31 +196,27 @@ func (p *KubeRuntime) readState(ctx context.Context) error {
 	return nil
 }
 
-// GetLocalURI retrieves a URI for the node intended to be accessible from this
-// process until the provided cancel function is called.
-func (p *KubeRuntime) GetLocalURI(ctx context.Context) (string, func(), error) {
-	if len(p.node.URI) == 0 {
-		// Assume that an empty URI indicates a need to read pod state
-		if err := p.readState(ctx); err != nil {
-			return "", func() {}, fmt.Errorf("failed to read Pod state: %w", err)
-		}
-	}
-
-	// Use direct pod URI if running inside the cluster
+// GetAccessibleURI retrieves a URI for the node accessible from where
+// this process is running. If the process is running inside a kube
+// cluster, the node and the process are assumed to be running in the
+// same kube cluster and the node's URI will be used. If the process is
+// running outside of a kube cluster, a URI accessible from outside of
+// the cluster will be used.
+func (p *KubeRuntime) GetAccessibleURI() string {
 	if IsRunningInCluster() {
-		return p.node.URI, func() {}, nil
+		return p.node.URI
 	}
 
-	port, stopChan, err := p.forwardPort(ctx, config.DefaultHTTPPort)
-	if err != nil {
-		return "", nil, err
-	}
-	return fmt.Sprintf("http://127.0.0.1:%d", port), func() { close(stopChan) }, nil
+	baseURI := p.runtimeConfig().BaseAccessibleURI
+	nodeID := p.node.NodeID.String()
+	networkUUID := p.node.network.UUID
+
+	return fmt.Sprintf("%s/networks/%s/%s", baseURI, networkUUID, nodeID)
 }
 
-// GetLocalStakingAddress retrieves a StakingAddress for the node intended to be
+// GetAccessibleStakingAddress retrieves a StakingAddress for the node intended to be
 // accessible from this process until the provided cancel function is called.
-func (p *KubeRuntime) GetLocalStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { +func (p *KubeRuntime) GetAccessibleStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { if p.node.StakingAddress == (netip.AddrPort{}) { // Assume that an empty staking address indicates a need to retrieve pod state if err := p.readState(ctx); err != nil { @@ -364,6 +369,24 @@ func (p *KubeRuntime) Start(ctx context.Context) error { zap.String("statefulSet", statefulSetName), ) + // Create Service for the node (prefix with 's-' for DNS compatibility) + serviceName := "s-" + statefulSetName + if err := p.createNodeService(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Service for node: %w", err) + } + + // Create Ingress for the node + if err := p.createNodeIngress(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Ingress for node: %w", err) + } + + // Wait for ingress to be ready if running outside cluster + if !IsRunningInCluster() { + if err := p.waitForIngressReadiness(ctx, serviceName); err != nil { + return fmt.Errorf("failed to wait for Ingress readiness: %w", err) + } + } + return p.ensureBootstrapIP(ctx) } @@ -624,9 +647,6 @@ func (p *KubeRuntime) Restart(ctx context.Context) error { } // IsHealthy checks if the node is running and healthy. -// -// TODO(marun) Add WaitForHealthy as a runtime method to minimize API calls required and -// enable reuse of forwarded connection when running external to the kubernetes cluster func (p *KubeRuntime) IsHealthy(ctx context.Context) (bool, error) { err := p.readState(ctx) if err != nil { @@ -636,13 +656,7 @@ func (p *KubeRuntime) IsHealthy(ctx context.Context) (bool, error) { return false, errNotRunning } - uri, cancel, err := p.GetLocalURI(ctx) - if err != nil { - return false, err - } - defer cancel() - - healthReply, err := CheckNodeHealth(ctx, uri) + healthReply, err := CheckNodeHealth(ctx, p.GetAccessibleURI()) if errors.Is(err, ErrUnrecoverableNodeHealthCheck) { return false, err } else if err != nil { @@ -890,6 +904,261 @@ func configureExclusiveScheduling(template *corev1.PodTemplateSpec, labelKey str } } +// createNodeService creates a Kubernetes Service for the node to enable ingress routing +func (p *KubeRuntime) createNodeService(ctx context.Context, serviceName string) error { + var ( + log = p.node.network.log + nodeID = p.node.NodeID.String() + runtimeConfig = p.runtimeConfig() + namespace = runtimeConfig.Namespace + ) + + log.Debug("creating Service for node", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + + clientset, err := p.getClientset() + if err != nil { + return err + } + + service := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceName, + Namespace: namespace, + Labels: map[string]string{ + "app": serviceName, + "network-uuid": p.node.network.UUID, + "node-id": nodeID, + }, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + "network_uuid": p.node.network.UUID, + "node_id": nodeID, + }, + Ports: []corev1.ServicePort{ + { + Name: "http", + Port: config.DefaultHTTPPort, + TargetPort: intstr.FromInt(config.DefaultHTTPPort), + Protocol: corev1.ProtocolTCP, + }, + }, + Type: corev1.ServiceTypeClusterIP, + }, + } + + _, err = clientset.CoreV1().Services(namespace).Create(ctx, service, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Service: %w", err) + } + + log.Debug("created Service", + zap.String("nodeID", nodeID), + 
zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + + return nil +} + +// createNodeIngress creates a Kubernetes Ingress for the node to enable external access +func (p *KubeRuntime) createNodeIngress(ctx context.Context, serviceName string) error { + var ( + log = p.node.network.log + nodeID = p.node.NodeID.String() + runtimeConfig = p.runtimeConfig() + namespace = runtimeConfig.Namespace + networkUUID = p.node.network.UUID + ) + + log.Debug("creating Ingress for node", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + + clientset, err := p.getClientset() + if err != nil { + return err + } + + var ( + ingressClassName = "nginx" // Assume nginx ingress controller + // Path pattern: /networks//(/|$)(.*) + // Using (/|$)(.*) to properly handle trailing slashes + pathPattern = fmt.Sprintf("/networks/%s/%s", networkUUID, nodeID) + "(/|$)(.*)" + pathType = networkingv1.PathTypeImplementationSpecific + ) + + ingress := &networkingv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceName, + Namespace: namespace, + Labels: map[string]string{ + "app": serviceName, + "network-uuid": networkUUID, + "node-id": nodeID, + }, + Annotations: map[string]string{ + "nginx.ingress.kubernetes.io/use-regex": "true", + "nginx.ingress.kubernetes.io/rewrite-target": "/$2", + "nginx.ingress.kubernetes.io/proxy-body-size": "0", + "nginx.ingress.kubernetes.io/proxy-read-timeout": "600", + "nginx.ingress.kubernetes.io/proxy-send-timeout": "600", + }, + }, + Spec: networkingv1.IngressSpec{ + IngressClassName: &ingressClassName, + Rules: []networkingv1.IngressRule{ + { + IngressRuleValue: networkingv1.IngressRuleValue{ + HTTP: &networkingv1.HTTPIngressRuleValue{ + Paths: []networkingv1.HTTPIngressPath{ + { + Path: pathPattern, + PathType: &pathType, + Backend: networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{ + Name: serviceName, + Port: networkingv1.ServiceBackendPort{ + Number: config.DefaultHTTPPort, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + _, err = clientset.NetworkingV1().Ingresses(namespace).Create(ctx, ingress, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Ingress: %w", err) + } + + log.Debug("created Ingress", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + zap.String("path", pathPattern), + ) + + return nil +} + +// waitForIngressReadiness waits for the ingress to be ready and able to route traffic +// This prevents 503 errors when health checks are performed immediately after node start +func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName string) error { + var ( + log = p.node.network.log + nodeID = p.node.NodeID.String() + runtimeConfig = p.runtimeConfig() + namespace = runtimeConfig.Namespace + ) + + log.Debug("waiting for Ingress readiness", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + + clientset, err := p.getClientset() + if err != nil { + return err + } + + // Wait for the ingress to exist and service endpoints to be available + err = wait.PollUntilContextCancel( + ctx, + statusCheckInterval, + true, // immediate + func(ctx context.Context) (bool, error) { + // Check if ingress exists + _, err := clientset.NetworkingV1().Ingresses(namespace).Get(ctx, serviceName, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + log.Debug("waiting for Ingress to be created", + 
zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return false, nil + } + if err != nil { + log.Warn("failed to retrieve Ingress", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + zap.Error(err), + ) + return false, nil + } + + // Check if service endpoints are available + endpoints, err := clientset.CoreV1().Endpoints(namespace).Get(ctx, serviceName, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + log.Debug("waiting for Service endpoints to be created", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + return false, nil + } + if err != nil { + log.Warn("failed to retrieve Service endpoints", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + zap.Error(err), + ) + return false, nil + } + + // Check if endpoints have at least one ready address + hasReadyEndpoints := false + for _, subset := range endpoints.Subsets { + if len(subset.Addresses) > 0 { + hasReadyEndpoints = true + break + } + } + + if !hasReadyEndpoints { + log.Debug("waiting for Service endpoints to have ready addresses", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + return false, nil + } + + log.Debug("Ingress and Service endpoints are ready", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return true, nil + }, + ) + if err != nil { + return fmt.Errorf("failed to wait for Ingress %s/%s readiness: %w", namespace, serviceName, err) + } + + log.Debug("Ingress is ready", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + + return nil +} + // IsRunningInCluster detects if this code is running inside a Kubernetes cluster // by checking for the presence of the service account token that's automatically // mounted in every pod. diff --git a/tests/fixture/tmpnet/monitor_kube.go b/tests/fixture/tmpnet/monitor_kube.go index 1bd5cc11cbf2..9128e55c80c9 100644 --- a/tests/fixture/tmpnet/monitor_kube.go +++ b/tests/fixture/tmpnet/monitor_kube.go @@ -37,7 +37,7 @@ type kubeCollectorConfig struct { } // DeployKubeCollectors deploys collectors of logs and metrics to a Kubernetes cluster. -func DeployKubeCollectors( +func deployKubeCollectors( ctx context.Context, log logging.Logger, configPath string, diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index b990ed114393..ecbe1e18de2b 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -449,11 +449,7 @@ func (n *Network) Bootstrap(ctx context.Context, log logging.Logger) error { } // Don't restart the node during subnet creation since it will always be restarted afterwards. - uri, cancel, err := bootstrapNode.GetLocalURI(ctx) - if err != nil { - return err - } - defer cancel() + uri := bootstrapNode.GetAccessibleURI() if err := n.CreateSubnets(ctx, log, uri, false /* restartRequired */); err != nil { return err } @@ -784,11 +780,9 @@ func (n *Network) GetNode(nodeID ids.NodeID) (*Node, error) { return nil, fmt.Errorf("%s is not known to the network", nodeID) } -// GetNodeURIs returns the URIs of nodes in the network that are running and not ephemeral. 
The URIs -// returned are guaranteed be reachable by the caller until the cleanup function is called regardless -// of whether the nodes are running as local processes or in a kube cluster. -func (n *Network) GetNodeURIs(ctx context.Context, deferCleanupFunc func(func())) ([]NodeURI, error) { - return GetNodeURIs(ctx, n.Nodes, deferCleanupFunc) +// GetNodeURIs returns the accessible URIs of nodes in the network that are running and not ephemeral. +func (n *Network) GetNodeURIs() []NodeURI { + return GetNodeURIs(n.Nodes) } // GetAvailableNodeIDs returns the node IDs of nodes in the network that are running and not ephemeral. @@ -969,7 +963,7 @@ func waitForHealthy(ctx context.Context, log logging.Logger, nodes []*Node) erro unhealthyNodes.Remove(node) log.Info("node is healthy", zap.Stringer("nodeID", node.NodeID), - zap.String("uri", node.URI), + zap.String("uri", node.GetAccessibleURI()), ) } diff --git a/tests/fixture/tmpnet/node.go b/tests/fixture/tmpnet/node.go index a76eb02a7e44..347253356b68 100644 --- a/tests/fixture/tmpnet/node.go +++ b/tests/fixture/tmpnet/node.go @@ -41,8 +41,8 @@ var ( // NodeRuntime defines the methods required to support running a node. type NodeRuntime interface { readState(ctx context.Context) error - GetLocalURI(ctx context.Context) (string, func(), error) - GetLocalStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) + GetAccessibleURI() string + GetAccessibleStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) Start(ctx context.Context) error InitiateStop(ctx context.Context) error WaitForStopped(ctx context.Context) error @@ -199,12 +199,12 @@ func (n *Node) readState(ctx context.Context) error { return n.getRuntime().readState(ctx) } -func (n *Node) GetLocalURI(ctx context.Context) (string, func(), error) { - return n.getRuntime().GetLocalURI(ctx) +func (n *Node) GetAccessibleURI() string { + return n.getRuntime().GetAccessibleURI() } -func (n *Node) GetLocalStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { - return n.getRuntime().GetLocalStakingAddress(ctx) +func (n *Node) GetAccessibleStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { + return n.getRuntime().GetAccessibleStakingAddress(ctx) } // Writes the current state of the metrics endpoint to disk @@ -213,12 +213,7 @@ func (n *Node) SaveMetricsSnapshot(ctx context.Context) error { // No URI to request metrics from return nil } - baseURI, cancel, err := n.GetLocalURI(ctx) - if err != nil { - return nil - } - defer cancel() - uri := baseURI + "/ext/metrics" + uri := n.GetAccessibleURI() + "/ext/metrics" req, err := http.NewRequestWithContext(ctx, http.MethodGet, uri, nil) if err != nil { return err diff --git a/tests/fixture/tmpnet/process_runtime.go b/tests/fixture/tmpnet/process_runtime.go index 9088f0928210..94d7154c3d59 100644 --- a/tests/fixture/tmpnet/process_runtime.go +++ b/tests/fixture/tmpnet/process_runtime.go @@ -417,11 +417,12 @@ func (p *ProcessRuntime) writeMonitoringConfigFile(name string, config []ConfigM return nil } -func (p *ProcessRuntime) GetLocalURI(_ context.Context) (string, func(), error) { - return p.node.URI, func() {}, nil +// GetAccessibleURI returns the URI that can be used to access the node's API. 
+func (p *ProcessRuntime) GetAccessibleURI() string { + return p.node.URI } -func (p *ProcessRuntime) GetLocalStakingAddress(_ context.Context) (netip.AddrPort, func(), error) { +func (p *ProcessRuntime) GetAccessibleStakingAddress(_ context.Context) (netip.AddrPort, func(), error) { return p.node.StakingAddress, func() {}, nil } diff --git a/tests/fixture/tmpnet/start_kind_cluster.go b/tests/fixture/tmpnet/start_kind_cluster.go index 1957005c4d21..4b49c179fcd6 100644 --- a/tests/fixture/tmpnet/start_kind_cluster.go +++ b/tests/fixture/tmpnet/start_kind_cluster.go @@ -13,6 +13,7 @@ import ( "strings" "go.uber.org/zap" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" @@ -37,6 +38,13 @@ const ( // TODO(marun) Check for the presence of the context rather than string matching on this error missingContextMsg = `context "` + KindKubeconfigContext + `" does not exist` + + // Ingress controller constants + ingressNamespace = "ingress-nginx" + ingressReleaseName = "ingress-nginx" + ingressChartRepo = "https://kubernetes.github.io/ingress-nginx" + ingressChartName = "ingress-nginx/ingress-nginx" + ingressControllerName = "ingress-nginx-controller" ) //go:embed yaml/tmpnet-rbac.yaml @@ -96,10 +104,14 @@ func StartKindCluster( return fmt.Errorf("failed to create service account kubeconfig context: %w", err) } - if err := DeployKubeCollectors(ctx, log, configPath, configContext, startMetricsCollector, startLogsCollector); err != nil { + if err := deployKubeCollectors(ctx, log, configPath, configContext, startMetricsCollector, startLogsCollector); err != nil { return fmt.Errorf("failed to deploy kube collectors: %w", err) } + if err := deployIngressController(ctx, log, configPath, configContext); err != nil { + return fmt.Errorf("failed to deploy ingress controller: %w", err) + } + return nil } @@ -285,3 +297,110 @@ func createServiceAccountKubeconfig( return nil } + +// deployIngressController deploys the nginx ingress controller using Helm. 
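+// As a rough sketch (illustrative only, assuming the chart repo and values
+// configured below), the Helm invocation assembled by this function is
+// equivalent to running:
+//
+//	helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
+//	helm repo update
+//	helm install ingress-nginx ingress-nginx/ingress-nginx \
+//	  --namespace ingress-nginx --create-namespace --wait \
+//	  --set controller.service.type=NodePort \
+//	  --set controller.service.nodePorts.http=30791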
+func deployIngressController(ctx context.Context, log logging.Logger, configPath string, configContext string) error { + log.Info("checking if nginx ingress controller is already running") + + isRunning, err := isIngressControllerRunning(ctx, log, configPath, configContext) + if err != nil { + return fmt.Errorf("failed to check nginx ingress controller status: %w", err) + } + if isRunning { + log.Info("nginx ingress controller already running") + return nil + } + + log.Info("deploying nginx ingress controller using Helm") + + // Add the helm repo for ingress-nginx + if err := runHelmCommand(ctx, "repo", "add", "ingress-nginx", ingressChartRepo); err != nil { + return fmt.Errorf("failed to add helm repo: %w", err) + } + if err := runHelmCommand(ctx, "repo", "update"); err != nil { + return fmt.Errorf("failed to update helm repos: %w", err) + } + + // Install nginx-ingress with values set directly via flags + // Using fixed nodePort 30791 for cross-platform compatibility + args := []string{ + "install", + ingressReleaseName, + ingressChartName, + "--namespace", ingressNamespace, + "--create-namespace", + "--wait", + "--set", "controller.service.type=NodePort", + // This port value must match the port configured in scripts/kind-with-registry.sh + "--set", "controller.service.nodePorts.http=30791", + "--set", "controller.admissionWebhooks.enabled=false", + "--set", "controller.config.proxy-read-timeout=600", + "--set", "controller.config.proxy-send-timeout=600", + "--set", "controller.config.proxy-body-size=0", + "--set", "controller.config.proxy-http-version=1.1", + "--set", "controller.metrics.enabled=true", + } + + if err := runHelmCommand(ctx, args...); err != nil { + return fmt.Errorf("failed to install nginx-ingress: %w", err) + } + + return waitForIngressController(ctx, log, configPath, configContext) +} + +// isIngressControllerRunning checks if the nginx ingress controller is already running. +func isIngressControllerRunning(ctx context.Context, log logging.Logger, configPath string, configContext string) (bool, error) { + clientset, err := GetClientset(log, configPath, configContext) + if err != nil { + return false, err + } + + // TODO(marun) Handle the case of the deployment being in a failed state + _, err = clientset.AppsV1().Deployments(ingressNamespace).Get(ctx, ingressControllerName, metav1.GetOptions{}) + isRunning := !apierrors.IsNotFound(err) || err == nil + return isRunning, nil +} + +// waitForIngressController waits for the nginx ingress controller to be ready. 
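+// Readiness is defined as the controller Deployment reporting at least one
+// ready replica; until then, requests routed through node Ingresses are
+// expected to fail (e.g. with 503s).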
+func waitForIngressController(ctx context.Context, log logging.Logger, configPath string, configContext string) error {
+	clientset, err := GetClientset(log, configPath, configContext)
+	if err != nil {
+		return fmt.Errorf("failed to get clientset: %w", err)
+	}
+
+	return wait.PollUntilContextCancel(ctx, statusCheckInterval, true /* immediate */, func(ctx context.Context) (bool, error) {
+		deployment, err := clientset.AppsV1().Deployments(ingressNamespace).Get(ctx, ingressControllerName, metav1.GetOptions{})
+		if err != nil {
+			log.Debug("failed to get nginx ingress controller deployment",
+				zap.String("namespace", ingressNamespace),
+				zap.String("deployment", ingressControllerName),
+				zap.Error(err),
+			)
+			return false, nil
+		}
+		if deployment.Status.ReadyReplicas == 0 {
+			log.Debug("waiting for nginx ingress controller to become ready",
+				zap.String("namespace", ingressNamespace),
+				zap.String("deployment", ingressControllerName),
+				zap.Int32("readyReplicas", deployment.Status.ReadyReplicas),
+				zap.Int32("replicas", deployment.Status.Replicas),
+			)
+			return false, nil
+		}
+
+		log.Info("nginx ingress controller is ready",
+			zap.String("namespace", ingressNamespace),
+			zap.String("deployment", ingressControllerName),
+			zap.Int32("readyReplicas", deployment.Status.ReadyReplicas),
+		)
+		return true, nil
+	})
+}
+
+// runHelmCommand runs a Helm command with the given arguments.
+func runHelmCommand(ctx context.Context, args ...string) error {
+	cmd := exec.CommandContext(ctx, "helm", args...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	return cmd.Run()
+}
diff --git a/tests/fixture/tmpnet/utils.go b/tests/fixture/tmpnet/utils.go
index e80070326900..121fe6a062bb 100644
--- a/tests/fixture/tmpnet/utils.go
+++ b/tests/fixture/tmpnet/utils.go
@@ -44,6 +44,14 @@ func CheckNodeHealth(ctx context.Context, uri string) (*health.APIReply, error)
 			return nil, err
 		}
 	}
+
+	// Assume `503 Service Unavailable` is the result of the ingress
+	// for the node not being ready.
+	// TODO(marun) Update Client.Health() to return a typed error
+	if err != nil && err.Error() == "received status code: 503" {
+		return nil, err
+	}
+
 	// Assume all other errors are not recoverable
 	return nil, fmt.Errorf("%w: %w", ErrUnrecoverableNodeHealthCheck, err)
 }
@@ -54,25 +62,18 @@ type NodeURI struct {
 	URI    string
 }
 
-// GetNodeURIs returns the URIs of the provided nodes that are running and not ephemeral. The URIs returned
-// are guaranteed be reachable by the caller until the cleanup function is called regardless of whether the
-// nodes are running as local processes or in a kube cluster.
-func GetNodeURIs(ctx context.Context, nodes []*Node, deferCleanupFunc func(func())) ([]NodeURI, error) {
+// GetNodeURIs returns the accessible URIs of the provided nodes that are running and not ephemeral.
+func GetNodeURIs(nodes []*Node) []NodeURI {
 	availableNodes := FilterAvailableNodes(nodes)
 	uris := []NodeURI{}
 	for _, node := range availableNodes {
-		uri, cancel, err := node.GetLocalURI(ctx)
-		if err != nil {
-			return nil, err
-		}
-		deferCleanupFunc(cancel)
 		uris = append(uris, NodeURI{
 			NodeID: node.NodeID,
-			URI:    uri,
+			URI:    node.GetAccessibleURI(),
 		})
 	}
-	return uris, nil
+	return uris
 }
 
 // FilteredAvailableNodes filters the provided nodes by whether they are running and not ephemeral.
@@ -96,15 +97,10 @@ func FilterAvailableNodes(nodes []*Node) []*Node {
 // blockchain ID, in the form "ws://<uri>/ext/bc/<blockchainID>/ws".
 // Ephemeral and stopped nodes are ignored.
func GetNodeWebsocketURIs(
-	ctx context.Context,
 	nodes []*Node,
 	blockchainID string,
-	deferCleanupFunc func(func()),
 ) ([]string, error) {
-	nodeURIs, err := GetNodeURIs(ctx, nodes, deferCleanupFunc)
-	if err != nil {
-		return nil, fmt.Errorf("failed to get node URIs: %w", err)
-	}
+	nodeURIs := GetNodeURIs(nodes)
 	wsURIs := make([]string, len(nodeURIs))
 	for i := range nodeURIs {
 		uri, err := url.Parse(nodeURIs[i].URI)
diff --git a/tests/load/c/main/main.go b/tests/load/c/main/main.go
index 676293d173af..fdee7b7eaa42 100644
--- a/tests/load/c/main/main.go
+++ b/tests/load/c/main/main.go
@@ -88,7 +88,7 @@ func main() {
 		)
 	})
 
-	endpoints, err := tmpnet.GetNodeWebsocketURIs(ctx, network.Nodes, blockchainID, tc.DeferCleanup)
+	endpoints, err := tmpnet.GetNodeWebsocketURIs(network.Nodes, blockchainID)
 	require.NoError(err, "failed to get node websocket URIs")
 
 	w := &workload{

From 52b06e13779661b6e989c498eb53ec12bf801bc3 Mon Sep 17 00:00:00 2001
From: maru 
Date: Thu, 26 Jun 2025 18:17:21 +0000
Subject: [PATCH 06/12] fixup: Update RBAC to support ingress deployment

---
 tests/fixture/tmpnet/kube_runtime.go       |  6 +++---
 tests/fixture/tmpnet/start_kind_cluster.go | 12 +++++++++---
 tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml | 11 +++++++++++
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go
index 63903a191347..5d808c8dd74a 100644
--- a/tests/fixture/tmpnet/kube_runtime.go
+++ b/tests/fixture/tmpnet/kube_runtime.go
@@ -1083,7 +1083,7 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s
 		// Check if ingress exists
 		_, err := clientset.NetworkingV1().Ingresses(namespace).Get(ctx, serviceName, metav1.GetOptions{})
 		if apierrors.IsNotFound(err) {
-			log.Debug("waiting for Ingress to be created",
+			log.Verbo("waiting for Ingress to be created",
 				zap.String("nodeID", nodeID),
 				zap.String("namespace", namespace),
 				zap.String("ingress", serviceName),
@@ -1103,7 +1103,7 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s
 		// Check if service endpoints are available
 		endpoints, err := clientset.CoreV1().Endpoints(namespace).Get(ctx, serviceName, metav1.GetOptions{})
 		if apierrors.IsNotFound(err) {
-			log.Debug("waiting for Service endpoints to be created",
+			log.Verbo("waiting for Service endpoints to be created",
 				zap.String("nodeID", nodeID),
 				zap.String("namespace", namespace),
 				zap.String("service", serviceName),
@@ -1130,7 +1130,7 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s
 		}
 
 		if !hasReadyEndpoints {
-			log.Debug("waiting for Service endpoints to have ready addresses",
+			log.Verbo("waiting for Service endpoints to have ready addresses",
 				zap.String("nodeID", nodeID),
 				zap.String("namespace", namespace),
 				zap.String("service", serviceName),
diff --git a/tests/fixture/tmpnet/start_kind_cluster.go b/tests/fixture/tmpnet/start_kind_cluster.go
index 4b49c179fcd6..cad7e186da39 100644
--- a/tests/fixture/tmpnet/start_kind_cluster.go
+++ b/tests/fixture/tmpnet/start_kind_cluster.go
@@ -239,13 +239,18 @@ func createServiceAccountKubeconfig(
 		return fmt.Errorf("failed to load kubeconfig: %w", err)
 	}
 
-	// Check if the context already exists
 	if _, exists := config.Contexts[newContextName]; exists {
-		log.Info("service account kubeconfig context already exists",
+		log.Info("service account kubeconfig context exists, recreating to ensure consistency with cluster state",
+			zap.String("kubeconfig", configPath),
+			zap.String("context", newContextName),
+
zap.String("namespace", namespace), + ) + } else { + log.Info("creating new service account kubeconfig context", + zap.String("kubeconfig", configPath), zap.String("context", newContextName), zap.String("namespace", namespace), ) - return nil } // Get the current context (already verified to exist by StartKindCluster) @@ -291,6 +296,7 @@ func createServiceAccountKubeconfig( } log.Info("created service account kubeconfig context", + zap.String("kubeconfig", configPath), zap.String("context", newContextName), zap.String("namespace", namespace), ) diff --git a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml index 0d3056614a85..e7f230a72515 100644 --- a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml +++ b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml @@ -11,6 +11,7 @@ metadata: name: tmpnet namespace: tmpnet rules: +# Regular usage - apiGroups: ["apps"] resources: ["statefulsets"] verbs: ["get", "create", "update", "patch"] @@ -23,6 +24,16 @@ rules: - apiGroups: [""] resources: ["pods/portforward"] verbs: ["create"] +# Enable external node access via ingress +- apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "create"] +- apiGroups: [""] + resources: ["endpoints"] + verbs: ["get"] +- apiGroups: [""] + resources: ["services"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding From 540cdf0190b337d4a513a59315661152240cc9d8 Mon Sep 17 00:00:00 2001 From: maru Date: Thu, 26 Jun 2025 18:24:36 +0000 Subject: [PATCH 07/12] fixup: Avoid deploying service or ingress when inside the cluster --- tests/fixture/tmpnet/kube_runtime.go | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 5d808c8dd74a..46dc1c641ee0 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -369,19 +369,18 @@ func (p *KubeRuntime) Start(ctx context.Context) error { zap.String("statefulSet", statefulSetName), ) - // Create Service for the node (prefix with 's-' for DNS compatibility) - serviceName := "s-" + statefulSetName - if err := p.createNodeService(ctx, serviceName); err != nil { - return fmt.Errorf("failed to create Service for node: %w", err) - } + if !IsRunningInCluster() { + // If running outside the cluster, ensure the node's API port is accessible via ingress - // Create Ingress for the node - if err := p.createNodeIngress(ctx, serviceName); err != nil { - return fmt.Errorf("failed to create Ingress for node: %w", err) - } + serviceName := "s-" + statefulSetName // The 's-' prefix ensures DNS compatibility + if err := p.createNodeService(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Service for node: %w", err) + } + + if err := p.createNodeIngress(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Ingress for node: %w", err) + } - // Wait for ingress to be ready if running outside cluster - if !IsRunningInCluster() { if err := p.waitForIngressReadiness(ctx, serviceName); err != nil { return fmt.Errorf("failed to wait for Ingress readiness: %w", err) } From 1888db3160ae5f1b5dc5a74dd077eefde00a3d76 Mon Sep 17 00:00:00 2001 From: maru Date: Fri, 27 Jun 2025 01:01:30 +0000 Subject: [PATCH 08/12] fixup: Avoid deploying to non-kind cluster without base accessible uri --- .github/workflows/ci.yml | 1 + scripts/tests.e2e.kube.sh | 15 ++++++++- tests/fixture/tmpnet/flags/kube_runtime.go | 14 ++++++-- tests/fixture/tmpnet/kube_runtime.go | 
39 +++++++++++++++++++--- tests/log.go | 3 +- 5 files changed, 63 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 31a41fc8f490..02cec5619ad3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -255,6 +255,7 @@ jobs: - uses: ./.github/actions/run-monitored-tmpnet-cmd with: run: ./scripts/run_task.sh test-load-kube-kind + runtime: kube artifact_prefix: load-kube prometheus_username: ${{ secrets.PROMETHEUS_ID || '' }} prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }} diff --git a/scripts/tests.e2e.kube.sh b/scripts/tests.e2e.kube.sh index 11c16104e7ae..d34067ed03dc 100755 --- a/scripts/tests.e2e.kube.sh +++ b/scripts/tests.e2e.kube.sh @@ -23,4 +23,17 @@ else XSVM_IMAGE="${XSVM_IMAGE}" AVALANCHEGO_IMAGE="${AVALANCHEGO_IMAGE}" bash -x ./scripts/build_xsvm_image.sh fi -bash -x ./scripts/tests.e2e.sh --runtime=kube --kube-image="${XSVM_IMAGE}" "$@" +# Determine kubeconfig context to use +KUBECONFIG_CONTEXT="" + +# Check if --kubeconfig-context is already provided in arguments +if [[ "$*" =~ --kubeconfig-context ]]; then + # User provided a context, use it as-is + echo "Using provided kubeconfig context from arguments" +else + # Default to the RBAC context + KUBECONFIG_CONTEXT="--kubeconfig-context=kind-kind-tmpnet" + echo "Defaulting to limited-permission context 'kind-kind-tmpnet' to test RBAC Role permissions" +fi + +bash -x ./scripts/tests.e2e.sh --runtime=kube --kube-image="${XSVM_IMAGE}" "$KUBECONFIG_CONTEXT" "$@" diff --git a/tests/fixture/tmpnet/flags/kube_runtime.go b/tests/fixture/tmpnet/flags/kube_runtime.go index af43e9040363..067104cf733d 100644 --- a/tests/fixture/tmpnet/flags/kube_runtime.go +++ b/tests/fixture/tmpnet/flags/kube_runtime.go @@ -91,7 +91,7 @@ func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[ui stringVar( &v.baseAccessibleURI, "kube-base-accessible-uri", - "http://localhost:30791", + "", kubeDocPrefix+"The base URI for constructing node URIs when running outside of the cluster hosting nodes", ) } @@ -106,7 +106,15 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB { return nil, errKubeMinVolumeSizeRequired } - if !tmpnet.IsRunningInCluster() && len(v.baseAccessibleURI) == 0 { + baseAccessibleURI := v.baseAccessibleURI + if strings.HasPrefix(v.config.Context, "kind-kind") && len(baseAccessibleURI) == 0 { + // Use the base uri expected for the kind cluster deployed by tmpnet. Not supplying this as a default + // ensures that an explicit value is required for non-kind clusters. + // + // TODO(marun) Log why this value is being used. This will require passing a log through the call chain. 
+ baseAccessibleURI = "http://localhost:30791" + } + if !tmpnet.IsRunningInCluster() && len(baseAccessibleURI) == 0 { return nil, errKubeBaseAccessibleURIRequired } return &tmpnet.KubeRuntimeConfig{ @@ -119,6 +127,6 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err SchedulingLabelKey: v.schedulingLabelKey, SchedulingLabelValue: v.schedulingLabelValue, // Strip trailing slashes to simplify path composition - BaseAccessibleURI: strings.TrimRight(v.baseAccessibleURI, "/"), + BaseAccessibleURI: strings.TrimRight(baseAccessibleURI, "/"), }, nil } diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 46dc1c641ee0..e73add3e64fb 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -1073,14 +1073,14 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s return err } - // Wait for the ingress to exist and service endpoints to be available + // Wait for the ingress to exist, be processed by the controller, and service endpoints to be available err = wait.PollUntilContextCancel( ctx, statusCheckInterval, true, // immediate func(ctx context.Context) (bool, error) { - // Check if ingress exists - _, err := clientset.NetworkingV1().Ingresses(namespace).Get(ctx, serviceName, metav1.GetOptions{}) + // Check if ingress exists and is processed by the controller + ingress, err := clientset.NetworkingV1().Ingresses(namespace).Get(ctx, serviceName, metav1.GetOptions{}) if apierrors.IsNotFound(err) { log.Verbo("waiting for Ingress to be created", zap.String("nodeID", nodeID), @@ -1099,6 +1099,37 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s return false, nil } + // Check if ingress controller has processed the ingress + // The ingress controller should populate the Status.LoadBalancer.Ingress field + // when it has successfully processed and exposed the ingress + hasIngressIP := len(ingress.Status.LoadBalancer.Ingress) > 0 + if !hasIngressIP { + log.Verbo("waiting for Ingress controller to process and expose the Ingress", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return false, nil + } + + // Validate that at least one ingress has an IP or hostname + hasValidIngress := false + for _, ing := range ingress.Status.LoadBalancer.Ingress { + if ing.IP != "" || ing.Hostname != "" { + hasValidIngress = true + break + } + } + + if !hasValidIngress { + log.Verbo("waiting for Ingress controller to assign IP or hostname", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return false, nil + } + // Check if service endpoints are available endpoints, err := clientset.CoreV1().Endpoints(namespace).Get(ctx, serviceName, metav1.GetOptions{}) if apierrors.IsNotFound(err) { @@ -1137,7 +1168,7 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s return false, nil } - log.Debug("Ingress and Service endpoints are ready", + log.Debug("Ingress is exposed by controller and Service endpoints are ready", zap.String("nodeID", nodeID), zap.String("namespace", namespace), zap.String("ingress", serviceName), diff --git a/tests/log.go b/tests/log.go index a431feb3c1ec..134cc9cd1f78 100644 --- a/tests/log.go +++ b/tests/log.go @@ -25,5 +25,6 @@ func LoggerForFormat(prefix string, rawLogFormat string) (logging.Logger, error) if err != nil { return nil, err } - return logging.NewLogger(prefix, 
logging.NewWrappedCore(logging.Verbo, writeCloser, logFormat.ConsoleEncoder())), nil + // TODO(marun) Make the log level configurable + return logging.NewLogger(prefix, logging.NewWrappedCore(logging.Debug, writeCloser, logFormat.ConsoleEncoder())), nil } From 9f2525562617b58b9633b50ec5dc1bb360a1ec80 Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 1 Jul 2025 03:20:45 +0000 Subject: [PATCH 09/12] fixup: Source ingress configuration from the cluster --- tests/fixture/tmpnet/flags/kube_runtime.go | 28 +--- tests/fixture/tmpnet/kube_runtime.go | 152 ++++++++++++++------- tests/fixture/tmpnet/start_kind_cluster.go | 49 +++++++ tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml | 3 + tests/load2/main/main.go | 2 +- 5 files changed, 159 insertions(+), 75 deletions(-) diff --git a/tests/fixture/tmpnet/flags/kube_runtime.go b/tests/fixture/tmpnet/flags/kube_runtime.go index 067104cf733d..3eee81ea99d9 100644 --- a/tests/fixture/tmpnet/flags/kube_runtime.go +++ b/tests/fixture/tmpnet/flags/kube_runtime.go @@ -7,7 +7,6 @@ import ( "errors" "flag" "fmt" - "strings" "github.com/spf13/pflag" @@ -21,10 +20,9 @@ const ( ) var ( - errKubeNamespaceRequired = errors.New("--kube-namespace is required") - errKubeImageRequired = errors.New("--kube-image is required") - errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) - errKubeBaseAccessibleURIRequired = errors.New("--kube-base-accessible-uri is required when running outside of cluster") + errKubeNamespaceRequired = errors.New("--kube-namespace is required") + errKubeImageRequired = errors.New("--kube-image is required") + errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) ) type kubeRuntimeVars struct { @@ -34,7 +32,6 @@ type kubeRuntimeVars struct { useExclusiveScheduling bool schedulingLabelKey string schedulingLabelValue string - baseAccessibleURI string config *KubeconfigVars } @@ -88,12 +85,6 @@ func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[ui "", kubeDocPrefix+"The label value to use for exclusive scheduling for node selection and toleration", ) - stringVar( - &v.baseAccessibleURI, - "kube-base-accessible-uri", - "", - kubeDocPrefix+"The base URI for constructing node URIs when running outside of the cluster hosting nodes", - ) } func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, error) { @@ -106,17 +97,6 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB { return nil, errKubeMinVolumeSizeRequired } - baseAccessibleURI := v.baseAccessibleURI - if strings.HasPrefix(v.config.Context, "kind-kind") && len(baseAccessibleURI) == 0 { - // Use the base uri expected for the kind cluster deployed by tmpnet. Not supplying this as a default - // ensures that an explicit value is required for non-kind clusters. - // - // TODO(marun) Log why this value is being used. This will require passing a log through the call chain. 
- baseAccessibleURI = "http://localhost:30791"
- }
- if !tmpnet.IsRunningInCluster() && len(baseAccessibleURI) == 0 {
- return nil, errKubeBaseAccessibleURIRequired
- }
 return &tmpnet.KubeRuntimeConfig{
 ConfigPath: v.config.Path,
 ConfigContext: v.config.Context,
@@ -126,7 +106,5 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err
 UseExclusiveScheduling: v.useExclusiveScheduling,
 SchedulingLabelKey: v.schedulingLabelKey,
 SchedulingLabelValue: v.schedulingLabelValue,
- // Strip trailing slashes to simplify path composition
- BaseAccessibleURI: strings.TrimRight(baseAccessibleURI, "/"),
 }, nil
 }
diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go
index e73add3e64fb..1c2eb7f168b8 100644
--- a/tests/fixture/tmpnet/kube_runtime.go
+++ b/tests/fixture/tmpnet/kube_runtime.go
@@ -55,9 +55,13 @@ const (

 // Name of config map containing tmpnet defaults
 defaultsConfigMapName = "tmpnet-defaults"
+
+ ingressHostKey = "ingressHost"
 )

-var errMissingSchedulingLabels = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when exclusive scheduling is enabled")
+var (
+ errMissingSchedulingLabels = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when exclusive scheduling is enabled")
+ errMissingIngressHost = errors.New("IngressHost is a required value. Ensure the " + defaultsConfigMapName + " ConfigMap contains an entry for " + ingressHostKey)
+)

 type KubeRuntimeConfig struct {
 // Path to the kubeconfig file identifying the target cluster
 ConfigPath string `json:"configPath,omitempty"`
@@ -78,14 +82,18 @@ type KubeRuntimeConfig struct {
 SchedulingLabelKey string `json:"schedulingLabelKey,omitempty"`
 // Label value to use for exclusive scheduling for node selection and toleration
 SchedulingLabelValue string `json:"schedulingLabelValue,omitempty"`
- // Base URI for constructing node URIs when running outside of the cluster hosting nodes (e.g., "http://localhost:30791")
- BaseAccessibleURI string `json:"baseAccessibleURI,omitempty"`
+ // Host for ingress rules (e.g., "localhost:30791" for kind, "tmpnet.example.com" for EKS)
+ IngressHost string `json:"ingressHost,omitempty"`
+ // TLS secret name for ingress (empty for HTTP, populated for HTTPS)
+ IngressSecret string `json:"ingressSecret,omitempty"`
 }

 // ensureDefaults sets cluster-specific defaults for fields not already set by flags.
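 // A ConfigMap satisfying these lookups could be seeded as in the following
 // sketch (hedged: the "purpose" and "higher-spec" values mirror the old flag
 // defaults and are illustrative only; the key names match the Data lookups
 // in the function body):
 //
 //	defaults := &corev1.ConfigMap{
 //		ObjectMeta: metav1.ObjectMeta{
 //			Name:      defaultsConfigMapName,
 //			Namespace: DefaultTmpnetNamespace,
 //		},
 //		Data: map[string]string{
 //			"schedulingLabelKey":   "purpose",
 //			"schedulingLabelValue": "higher-spec",
 //			ingressHostKey:         "localhost:30791",
 //		},
 //	}
 //	_, err := clientset.CoreV1().ConfigMaps(DefaultTmpnetNamespace).Create(ctx, defaults, metav1.CreateOptions{})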
func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logger) error { + // Only read defaults if necessary requireSchedulingDefaults := c.UseExclusiveScheduling && (len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0) - if !requireSchedulingDefaults { + requireIngressDefaults := !IsRunningInCluster() && len(c.IngressHost) == 0 + if !requireSchedulingDefaults && !requireIngressDefaults { return nil } @@ -104,26 +112,47 @@ func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logg return fmt.Errorf("failed to get ConfigMap: %w", err) } - var ( - schedulingLabelKey = configMap.Data["schedulingLabelKey"] - schedulingLabelValue = configMap.Data["schedulingLabelValue"] - ) - if len(c.SchedulingLabelKey) == 0 && len(schedulingLabelKey) > 0 { - log.Info("setting default value for SchedulingLabelKey", - zap.String("schedulingLabelKey", schedulingLabelKey), + if requireSchedulingDefaults { + var ( + schedulingLabelKey = configMap.Data["schedulingLabelKey"] + schedulingLabelValue = configMap.Data["schedulingLabelValue"] ) - c.SchedulingLabelKey = schedulingLabelKey + if len(c.SchedulingLabelKey) == 0 && len(schedulingLabelKey) > 0 { + log.Info("setting default value for SchedulingLabelKey", + zap.String("schedulingLabelKey", schedulingLabelKey), + ) + c.SchedulingLabelKey = schedulingLabelKey + } + if len(c.SchedulingLabelValue) == 0 && len(schedulingLabelValue) > 0 { + log.Info("setting default value for SchedulingLabelValue", + zap.String("schedulingLabelValue", schedulingLabelValue), + ) + c.SchedulingLabelValue = schedulingLabelValue + } + if len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0 { + return errMissingSchedulingLabels + } } - if len(c.SchedulingLabelValue) == 0 && len(schedulingLabelValue) > 0 { - log.Info("setting default value for SchedulingLabelValue", - zap.String("schedulingLabelValue", schedulingLabelValue), + if requireIngressDefaults { + var ( + ingressHost = configMap.Data[ingressHostKey] + ingressSecret = configMap.Data["ingressSecret"] ) - c.SchedulingLabelValue = schedulingLabelValue - } - - // Validate that the scheduling labels are now set - if len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0 { - return errMissingSchedulingLabels + if len(c.IngressHost) == 0 && len(ingressHost) > 0 { + log.Info("setting default value for IngressHost", + zap.String("ingressHost", ingressHost), + ) + c.IngressHost = ingressHost + } + if len(c.IngressSecret) == 0 && len(ingressSecret) > 0 { + log.Info("setting default value for IngressSecret", + zap.String("ingressSecret", ingressSecret), + ) + c.IngressSecret = ingressSecret + } + if len(c.IngressHost) == 0 { + return errMissingIngressHost + } } return nil @@ -150,8 +179,8 @@ func (p *KubeRuntime) readState(ctx context.Context) error { ) // Validate that it will be possible to construct accessible URIs when running external to the kube cluster - if !IsRunningInCluster() && len(runtimeConfig.BaseAccessibleURI) == 0 { - return errors.New("BaseAccessibleURI must be set when running outside of the kubernetes cluster") + if !IsRunningInCluster() && len(runtimeConfig.IngressHost) == 0 { + return errors.New("IngressHost must be set when running outside of the kubernetes cluster") } clientset, err := p.getClientset() @@ -207,11 +236,18 @@ func (p *KubeRuntime) GetAccessibleURI() string { return p.node.URI } - baseURI := p.runtimeConfig().BaseAccessibleURI - nodeID := p.node.NodeID.String() - networkUUID := p.node.network.UUID + var ( + protocol = "http" + 
nodeID = p.node.NodeID.String()
+ networkUUID = p.node.network.UUID
+ runtimeConfig = p.runtimeConfig()
+ )
+ // Assume TLS is configured when an ingress secret is present
+ if len(runtimeConfig.IngressSecret) > 0 {
+ protocol = "https"
+ }

- return fmt.Sprintf("%s/networks/%s/%s", baseURI, networkUUID, nodeID)
+ return fmt.Sprintf("%s://%s/networks/%s/%s", protocol, runtimeConfig.IngressHost, networkUUID, nodeID)
 }

 // GetAccessibleStakingAddress retrieves a StakingAddress for the node intended to be
@@ -993,6 +1029,35 @@ func (p *KubeRuntime) createNodeIngress(ctx context.Context, serviceName string)
 pathType = networkingv1.PathTypeImplementationSpecific
 )

+ // Build the ingress rules
+ ingressRules := []networkingv1.IngressRule{
+ {
+ IngressRuleValue: networkingv1.IngressRuleValue{
+ HTTP: &networkingv1.HTTPIngressRuleValue{
+ Paths: []networkingv1.HTTPIngressPath{
+ {
+ Path: pathPattern,
+ PathType: &pathType,
+ Backend: networkingv1.IngressBackend{
+ Service: &networkingv1.IngressServiceBackend{
+ Name: serviceName,
+ Port: networkingv1.ServiceBackendPort{
+ Number: config.DefaultHTTPPort,
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+ },
+ }
+
+ // Add host if not localhost
+ if !strings.HasPrefix(runtimeConfig.IngressHost, "localhost") {
+ ingressRules[0].Host = runtimeConfig.IngressHost
+ }
+
 ingress := &networkingv1.Ingress{
 ObjectMeta: metav1.ObjectMeta{
 Name: serviceName,
@@ -1012,31 +1077,20 @@ func (p *KubeRuntime) createNodeIngress(ctx context.Context, serviceName string)
 },
 Spec: networkingv1.IngressSpec{
 IngressClassName: &ingressClassName,
- Rules: []networkingv1.IngressRule{
- {
- IngressRuleValue: networkingv1.IngressRuleValue{
- HTTP: &networkingv1.HTTPIngressRuleValue{
- Paths: []networkingv1.HTTPIngressPath{
- {
- Path: pathPattern,
- PathType: &pathType,
- Backend: networkingv1.IngressBackend{
- Service: &networkingv1.IngressServiceBackend{
- Name: serviceName,
- Port: networkingv1.ServiceBackendPort{
- Number: config.DefaultHTTPPort,
- },
- },
- },
- },
- },
- },
- },
- },
- },
+ Rules: ingressRules,
 },
 }

+ // Add TLS configuration if IngressSecret is set
+ if len(runtimeConfig.IngressSecret) > 0 && !strings.HasPrefix(runtimeConfig.IngressHost, "localhost") {
+ ingress.Spec.TLS = []networkingv1.IngressTLS{
+ {
+ Hosts: []string{runtimeConfig.IngressHost},
+ SecretName: runtimeConfig.IngressSecret,
+ },
+ }
+ }
+
 _, err = clientset.NetworkingV1().Ingresses(namespace).Create(ctx, ingress, metav1.CreateOptions{})
 if err != nil {
 return fmt.Errorf("failed to create Ingress: %w", err)
diff --git a/tests/fixture/tmpnet/start_kind_cluster.go b/tests/fixture/tmpnet/start_kind_cluster.go
index cad7e186da39..f8d4299875c7 100644
--- a/tests/fixture/tmpnet/start_kind_cluster.go
+++ b/tests/fixture/tmpnet/start_kind_cluster.go
@@ -112,6 +112,10 @@ func StartKindCluster(
 return fmt.Errorf("failed to deploy ingress controller: %w", err)
 }

+ if err := createDefaultsConfigMap(ctx, log, configPath, configContext, DefaultTmpnetNamespace); err != nil {
+ return fmt.Errorf("failed to create defaults ConfigMap: %w", err)
+ }
+
 return nil
 }

@@ -410,3 +414,48 @@ func runHelmCommand(ctx context.Context, args ...string) error {
 cmd.Stderr = os.Stderr
 return cmd.Run()
 }
+
+// createDefaultsConfigMap creates a ConfigMap containing defaults for the tmpnet namespace.
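+// The ingressHost value seeded here is what GetAccessibleURI (above) uses to
+// compose node URIs. A hedged sketch of the two compositions, where
+// "tmpnet.example.com" stands in for a hypothetical non-kind host with a TLS
+// secret configured:
+//
+//	http://localhost:30791/networks/<networkUUID>/<nodeID>
+//	https://tmpnet.example.com/networks/<networkUUID>/<nodeID>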
+func createDefaultsConfigMap(ctx context.Context, log logging.Logger, configPath string, configContext string, namespace string) error { + clientset, err := GetClientset(log, configPath, configContext) + if err != nil { + return fmt.Errorf("failed to get clientset: %w", err) + } + + configMapName := defaultsConfigMapName + + // Check if configmap already exists + _, err = clientset.CoreV1().ConfigMaps(namespace).Get(ctx, configMapName, metav1.GetOptions{}) + if err == nil { + log.Info("defaults ConfigMap already exists", + zap.String("namespace", namespace), + zap.String("configMap", configMapName), + ) + return nil + } + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed to check for configmap %s/%s: %w", namespace, configMapName, err) + } + + log.Info("creating defaults ConfigMap", + zap.String("namespace", namespace), + zap.String("configMap", configMapName), + ) + + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: namespace, + }, + Data: map[string]string{ + ingressHostKey: "localhost:30791", + }, + } + + _, err = clientset.CoreV1().ConfigMaps(namespace).Create(ctx, configMap, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create configmap %s/%s: %w", namespace, configMapName, err) + } + + return nil +} diff --git a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml index e7f230a72515..21db7582fd0f 100644 --- a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml +++ b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml @@ -28,6 +28,9 @@ rules: - apiGroups: ["networking.k8s.io"] resources: ["ingresses"] verbs: ["get", "create"] +- apiGroups: [""] + resources: ["configmaps"] + verbs: ["get"] - apiGroups: [""] resources: ["endpoints"] verbs: ["get"] diff --git a/tests/load2/main/main.go b/tests/load2/main/main.go index 07646d4851c7..1985f8f6902d 100644 --- a/tests/load2/main/main.go +++ b/tests/load2/main/main.go @@ -69,7 +69,7 @@ func main() { e2e.NewTestEnvironment(tc, flagVars, network) ctx := tests.DefaultNotifyContext(0, tc.DeferCleanup) - wsURIs, err := tmpnet.GetNodeWebsocketURIs(ctx, network.Nodes, blockchainID, tc.DeferCleanup) + wsURIs, err := tmpnet.GetNodeWebsocketURIs(network.Nodes, blockchainID) require.NoError(err) registry := prometheus.NewRegistry() From 3818f0ddca09b0730223183fe2086c0544b90d3b Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 1 Jul 2025 10:27:32 +0000 Subject: [PATCH 10/12] fixup: Cache kubeconfig --- tests/fixture/tmpnet/kube_runtime.go | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 1c2eb7f168b8..1e4b1e5394b1 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -160,6 +160,8 @@ func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logg type KubeRuntime struct { node *Node + + kubeConfig *restclient.Config } // readState reads the URI and staking address for the node if the node is running. @@ -818,13 +820,23 @@ func (p *KubeRuntime) runtimeConfig() *KubeRuntimeConfig { return p.node.getRuntimeConfig().Kube } +// getKubeconfig retrieves the kubeconfig for the target cluster. It +// will be cached after the first call to avoid unnecessary logging +// when running in-cluster. 
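+// A hypothetical illustration of the caching (identifiers are for exposition
+// only):
+//
+//	cfgA, _ := runtime.getKubeconfig() // first call builds and caches the config
+//	cfgB, _ := runtime.getKubeconfig() // subsequent calls return the cached pointer
+//	// cfgA == cfgB: client config construction and its logging happen at most once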
func (p *KubeRuntime) getKubeconfig() (*restclient.Config, error) { - runtimeConfig := p.runtimeConfig() - return GetClientConfig( - p.node.network.log, - runtimeConfig.ConfigPath, - runtimeConfig.ConfigContext, - ) + if p.kubeConfig == nil { + runtimeConfig := p.runtimeConfig() + config, err := GetClientConfig( + p.node.network.log, + runtimeConfig.ConfigPath, + runtimeConfig.ConfigContext, + ) + if err != nil { + return nil, fmt.Errorf("failed to get kubeconfig: %w", err) + } + p.kubeConfig = config + } + return p.kubeConfig, nil } func (p *KubeRuntime) getClientset() (*kubernetes.Clientset, error) { From 79915253def2a94d8e5d09ab72d6d883ade25888 Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 8 Jul 2025 05:47:27 +0000 Subject: [PATCH 11/12] fixup: Ensure compatibility with non-localhost ingress host --- tests/fixture/tmpnet/kube_runtime.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 1e4b1e5394b1..ab86956291e3 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -903,6 +903,10 @@ func (p *KubeRuntime) getFlags() (FlagsMap, error) { flags[config.DataDirKey] = volumeMountPath // The node must bind to the Pod IP to enable the kubelet to access the http port for the readiness check flags[config.HTTPHostKey] = "0.0.0.0" + // Ensure compatibility with a non-localhost ingress host + if !IsRunningInCluster() && !strings.HasPrefix(p.runtimeConfig().IngressHost, "localhost") { + flags[config.HTTPAllowedHostsKey] = p.runtimeConfig().IngressHost + } return flags, nil } From f565e8f81361c2f4f613fa91fe1bff4d5fa1bc10 Mon Sep 17 00:00:00 2001 From: maru Date: Wed, 9 Jul 2025 02:13:41 +0000 Subject: [PATCH 12/12] [tmpnet] Enable installation of chaos mesh to local kind cluster --- scripts/kind-with-registry.sh | 2 +- tests/fixture/tmpnet/start_kind_cluster.go | 181 ++++++++++++++++----- tests/fixture/tmpnet/tmpnetctl/main.go | 7 +- 3 files changed, 148 insertions(+), 42 deletions(-) diff --git a/scripts/kind-with-registry.sh b/scripts/kind-with-registry.sh index 5027432722eb..7ceed8175721 100755 --- a/scripts/kind-with-registry.sh +++ b/scripts/kind-with-registry.sh @@ -47,7 +47,7 @@ nodes: - role: control-plane extraPortMappings: # Exposing a nodeport for nginx ingress is the reason this script needed to be copied and customized - # This port must match the value used to deploy the nginx controller by tests/fixture/tmpnet/start-kind-cluster.go + # This port must match the ingressNodePort constant in tests/fixture/tmpnet/start_kind_cluster.go - containerPort: 30791 hostPort: 30791 protocol: TCP diff --git a/tests/fixture/tmpnet/start_kind_cluster.go b/tests/fixture/tmpnet/start_kind_cluster.go index f8d4299875c7..b64bbc03e9a9 100644 --- a/tests/fixture/tmpnet/start_kind_cluster.go +++ b/tests/fixture/tmpnet/start_kind_cluster.go @@ -45,6 +45,17 @@ const ( ingressChartRepo = "https://kubernetes.github.io/ingress-nginx" ingressChartName = "ingress-nginx/ingress-nginx" ingressControllerName = "ingress-nginx-controller" + ingressNodePort = 30791 + + // Chaos Mesh constants + chaosMeshNamespace = "chaos-mesh" + chaosMeshReleaseName = "chaos-mesh" + chaosMeshChartRepo = "https://charts.chaos-mesh.org" + chaosMeshChartName = "chaos-mesh/chaos-mesh" + chaosMeshChartVersion = "2.7.2" + chaosMeshControllerName = "chaos-controller-manager" + chaosMeshDashboardName = "chaos-dashboard" + chaosMeshDashboardHost = "chaos-mesh.localhost" ) //go:embed yaml/tmpnet-rbac.yaml @@ -57,6 +68,7 @@ 
func StartKindCluster(
 configPath string,
 startMetricsCollector bool,
 startLogsCollector bool,
+ installChaosMesh bool,
) error {
 configContext := KindKubeconfigContext
@@ -116,6 +128,12 @@ func StartKindCluster(
 return fmt.Errorf("failed to create defaults ConfigMap: %w", err)
 }

+ if installChaosMesh {
+ if err := deployChaosMesh(ctx, log, configPath, configContext); err != nil {
+ return fmt.Errorf("failed to deploy chaos mesh: %w", err)
+ }
+ }
+
 return nil
 }

@@ -342,7 +360,7 @@ func deployIngressController(ctx context.Context, log logging.Logger, configPath
 "--wait",
 "--set", "controller.service.type=NodePort",
 // This port value must match the port configured in scripts/kind-with-registry.sh
- "--set", "controller.service.nodePorts.http=30791",
+ "--set", fmt.Sprintf("controller.service.nodePorts.http=%d", ingressNodePort),
 "--set", "controller.admissionWebhooks.enabled=false",
 "--set", "controller.config.proxy-read-timeout=600",
 "--set", "controller.config.proxy-send-timeout=600",
@@ -355,7 +373,7 @@ func deployIngressController(ctx context.Context, log logging.Logger, configPath
 return fmt.Errorf("failed to install nginx-ingress: %w", err)
 }

- return waitForIngressController(ctx, log, configPath, configContext)
+ return waitForDeployment(ctx, log, configPath, configContext, ingressNamespace, ingressControllerName, "nginx ingress controller")
 }

 // isIngressControllerRunning checks if the nginx ingress controller is already running.
@@ -371,42 +389,6 @@ func isIngressControllerRunning(ctx context.Context, log logging.Logger, configP
 return isRunning, nil
 }

-// waitForIngressController waits for the nginx ingress controller to be ready.
-func waitForIngressController(ctx context.Context, log logging.Logger, configPath string, configContext string) error {
- clientset, err := GetClientset(log, configPath, configContext)
- if err != nil {
- return fmt.Errorf("failed to get clientset: %w", err)
- }
-
- return wait.PollUntilContextCancel(ctx, statusCheckInterval, true /* immediate */, func(ctx context.Context) (bool, error) {
- deployment, err := clientset.AppsV1().Deployments(ingressNamespace).Get(ctx, ingressControllerName, metav1.GetOptions{})
- if err != nil {
- log.Debug("failed to get nginx ingress controller deployment",
- zap.String("namespace", ingressNamespace),
- zap.String("deployment", ingressControllerName),
- zap.Error(err),
- )
- return false, nil
- }
- if deployment.Status.ReadyReplicas == 0 {
- log.Debug("waiting for nginx ingress controller to become ready",
- zap.String("namespace", ingressNamespace),
- zap.String("deployment", ingressControllerName),
- zap.Int32("readyReplicas", deployment.Status.ReadyReplicas),
- zap.Int32("replicas", deployment.Status.Replicas),
- )
- return false, nil
- }
-
- log.Info("nginx ingress controller is ready",
- zap.String("namespace", ingressNamespace),
- zap.String("deployment", ingressControllerName),
- zap.Int32("readyReplicas", deployment.Status.ReadyReplicas),
- )
- return true, nil
- })
-}
-
 // runHelmCommand runs a Helm command with the given arguments.
 func runHelmCommand(ctx context.Context, args ...string) error {
 cmd := exec.CommandContext(ctx, "helm", args...)
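 // The ingress-specific wait above was folded into the shared waitForDeployment
 // helper (added with the chaos mesh support below) so the same readiness loop
 // gates both the ingress controller and the chaos mesh components. A hedged
 // usage sketch of runHelmCommand, mirroring the repo-add pattern used for
 // chaos mesh (the "ingress-nginx" repo alias is illustrative):
 //
 //	if err := runHelmCommand(ctx, "repo", "add", "ingress-nginx", ingressChartRepo); err != nil {
 //		return fmt.Errorf("failed to add ingress helm repo: %w", err)
 //	}
 //	if err := runHelmCommand(ctx, "repo", "update"); err != nil {
 //		return fmt.Errorf("failed to update helm repos: %w", err)
 //	}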
@@ -448,7 +430,7 @@ func createDefaultsConfigMap(ctx context.Context, log logging.Logger, configPath
 Namespace: namespace,
 },
 Data: map[string]string{
- ingressHostKey: "localhost:30791",
+ ingressHostKey: fmt.Sprintf("localhost:%d", ingressNodePort),
 },
 }

@@ -459,3 +441,124 @@ func createDefaultsConfigMap(ctx context.Context, log logging.Logger, configPath

 return nil
 }
+
+// deployChaosMesh deploys Chaos Mesh using Helm.
+func deployChaosMesh(ctx context.Context, log logging.Logger, configPath string, configContext string) error {
+ log.Info("checking if chaos mesh is already running")
+
+ isRunning, err := isChaosMeshRunning(ctx, log, configPath, configContext)
+ if err != nil {
+ return fmt.Errorf("failed to check chaos mesh status: %w", err)
+ }
+ if isRunning {
+ log.Info("chaos mesh already running")
+ return nil
+ }
+
+ log.Info("deploying chaos mesh using Helm")
+
+ // Add the helm repo for chaos-mesh
+ if err := runHelmCommand(ctx, "repo", "add", "chaos-mesh", chaosMeshChartRepo); err != nil {
+ return fmt.Errorf("failed to add chaos mesh helm repo: %w", err)
+ }
+ if err := runHelmCommand(ctx, "repo", "update"); err != nil {
+ return fmt.Errorf("failed to update helm repos: %w", err)
+ }
+
+ // Install Chaos Mesh with all required settings including ingress
+ args := []string{
+ "install",
+ chaosMeshReleaseName,
+ chaosMeshChartName,
+ "--namespace", chaosMeshNamespace,
+ "--create-namespace",
+ "--version", chaosMeshChartVersion,
+ "--wait",
+ "--set", "chaosDaemon.runtime=containerd",
+ "--set", "chaosDaemon.socketPath=/run/containerd/containerd.sock",
+ "--set", "dashboard.persistentVolume.enabled=true",
+ "--set", "dashboard.persistentVolume.storageClass=standard",
+ "--set", "dashboard.securityMode=false",
+ "--set", "controllerManager.leaderElection.enabled=false",
+ "--set", "dashboard.ingress.enabled=true",
+ "--set", "dashboard.ingress.ingressClassName=nginx",
+ "--set", "dashboard.ingress.hosts[0].name=" + chaosMeshDashboardHost,
+ }
+
+ if err := runHelmCommand(ctx, args...); err != nil {
+ return fmt.Errorf("failed to install chaos mesh: %w", err)
+ }
+
+ // Wait for Chaos Mesh to be ready
+ if err := waitForChaosMesh(ctx, log, configPath, configContext); err != nil {
+ return fmt.Errorf("chaos mesh deployment failed: %w", err)
+ }
+
+ // Log access information
+ log.Info("Chaos Mesh installed successfully",
+ zap.String("dashboardURL", fmt.Sprintf("http://%s:%d", chaosMeshDashboardHost, ingressNodePort)),
+ )
+ log.Warn("Chaos Mesh dashboard security is disabled - use only for local development")
+
+ return nil
+}
+
+// isChaosMeshRunning checks if Chaos Mesh is already running.
+func isChaosMeshRunning(ctx context.Context, log logging.Logger, configPath string, configContext string) (bool, error) {
+ clientset, err := GetClientset(log, configPath, configContext)
+ if err != nil {
+ return false, err
+ }
+
+ // Check if controller manager deployment exists
+ _, err = clientset.AppsV1().Deployments(chaosMeshNamespace).Get(ctx, chaosMeshControllerName, metav1.GetOptions{})
+ return !apierrors.IsNotFound(err), nil
+}
+
+// waitForChaosMesh waits for Chaos Mesh components to be ready.
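+// Readiness is delegated to the shared waitForDeployment helper: the
+// controller manager is awaited first, then the dashboard. With the kind
+// defaults above, the ready dashboard is served through the nginx ingress at
+// http://chaos-mesh.localhost:30791 (per the chaosMeshDashboardHost and
+// ingressNodePort constants).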
+func waitForChaosMesh(ctx context.Context, log logging.Logger, configPath string, configContext string) error { + // Wait for controller manager + if err := waitForDeployment(ctx, log, configPath, configContext, chaosMeshNamespace, chaosMeshControllerName, "chaos mesh controller manager"); err != nil { + return fmt.Errorf("controller manager not ready: %w", err) + } + + // Wait for dashboard + return waitForDeployment(ctx, log, configPath, configContext, chaosMeshNamespace, chaosMeshDashboardName, "chaos mesh dashboard") +} + +// waitForDeployment waits for a deployment to have at least one ready replica. +func waitForDeployment(ctx context.Context, log logging.Logger, configPath string, configContext string, namespace string, deploymentName string, displayName string) error { + clientset, err := GetClientset(log, configPath, configContext) + if err != nil { + return fmt.Errorf("failed to get clientset: %w", err) + } + + log.Info("waiting for " + displayName + " to be ready") + return wait.PollUntilContextCancel(ctx, statusCheckInterval, true /* immediate */, func(ctx context.Context) (bool, error) { + deployment, err := clientset.AppsV1().Deployments(namespace).Get(ctx, deploymentName, metav1.GetOptions{}) + if err != nil { + log.Debug("failed to get "+displayName+" deployment", + zap.String("namespace", namespace), + zap.String("deployment", deploymentName), + zap.Error(err), + ) + return false, nil + } + if deployment.Status.ReadyReplicas == 0 { + log.Debug("waiting for "+displayName+" to become ready", + zap.String("namespace", namespace), + zap.String("deployment", deploymentName), + zap.Int32("readyReplicas", deployment.Status.ReadyReplicas), + zap.Int32("replicas", deployment.Status.Replicas), + ) + return false, nil + } + + log.Info(displayName+" is ready", + zap.String("namespace", namespace), + zap.String("deployment", deploymentName), + zap.Int32("readyReplicas", deployment.Status.ReadyReplicas), + ) + return true, nil + }) +} diff --git a/tests/fixture/tmpnet/tmpnetctl/main.go b/tests/fixture/tmpnet/tmpnetctl/main.go index be261091fdc1..da2accd9e45a 100644 --- a/tests/fixture/tmpnet/tmpnetctl/main.go +++ b/tests/fixture/tmpnet/tmpnetctl/main.go @@ -271,8 +271,9 @@ func main() { rootCmd.AddCommand(checkLogsCmd) var ( - kubeconfigVars *flags.KubeconfigVars - collectorVars *flags.CollectorVars + kubeconfigVars *flags.KubeconfigVars + collectorVars *flags.CollectorVars + installChaosMesh bool ) startKindClusterCmd := &cobra.Command{ Use: "start-kind-cluster", @@ -302,11 +303,13 @@ func main() { kubeconfigVars.Path, collectorVars.StartMetricsCollector, collectorVars.StartLogsCollector, + installChaosMesh, ) }, } kubeconfigVars = flags.NewKubeconfigFlagSetVars(startKindClusterCmd.PersistentFlags()) collectorVars = flags.NewCollectorFlagSetVars(startKindClusterCmd.PersistentFlags()) + startKindClusterCmd.PersistentFlags().BoolVar(&installChaosMesh, "install-chaos-mesh", false, "Install Chaos Mesh in the kind cluster") rootCmd.AddCommand(startKindClusterCmd) if err := rootCmd.Execute(); err != nil {
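
A hedged sketch of exercising the new flow end to end (the leading context and logger parameters of StartKindCluster are not visible in the hunks above, so their exact positions are assumed):

	// Equivalent to: tmpnetctl start-kind-cluster --install-chaos-mesh
	if err := tmpnet.StartKindCluster(
		ctx,
		log,
		kubeconfigPath, // e.g. the value of the kubeconfig path flag
		true,           // startMetricsCollector
		true,           // startLogsCollector
		true,           // installChaosMesh
	); err != nil {
		return fmt.Errorf("failed to start kind cluster: %w", err)
	}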