From 15e503c624e282ce3aa36aeb06dfc88a24960c1b Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 8 Jul 2025 03:41:17 +0000 Subject: [PATCH 01/14] [tmpnet] Source tmpnet defaults from a configmap Previously the scheduling label and value required to enable exclusive scheduling were defined as flag defaults. To enable the cluster to define these defaults, the defaults are now sourced from a configmap in the target namespace. --- tests/fixture/tmpnet/flags/kube_runtime.go | 14 ++---- tests/fixture/tmpnet/kube_runtime.go | 53 ++++++++++++++++++++++ tests/fixture/tmpnet/network.go | 8 +++- tests/fixture/tmpnet/network_test.go | 2 +- 4 files changed, 65 insertions(+), 12 deletions(-) diff --git a/tests/fixture/tmpnet/flags/kube_runtime.go b/tests/fixture/tmpnet/flags/kube_runtime.go index cc5304cb7952..3eee81ea99d9 100644 --- a/tests/fixture/tmpnet/flags/kube_runtime.go +++ b/tests/fixture/tmpnet/flags/kube_runtime.go @@ -20,10 +20,9 @@ const ( ) var ( - errKubeNamespaceRequired = errors.New("--kube-namespace is required") - errKubeImageRequired = errors.New("--kube-image is required") - errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) - errKubeSchedulingLabelRequired = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when --kube-use-exclusive-scheduling is enabled") + errKubeNamespaceRequired = errors.New("--kube-namespace is required") + errKubeImageRequired = errors.New("--kube-image is required") + errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) ) type kubeRuntimeVars struct { @@ -77,13 +76,13 @@ func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[ui stringVar( &v.schedulingLabelKey, "kube-scheduling-label-key", - "purpose", + "", kubeDocPrefix+"The label key to use for exclusive scheduling for node selection and toleration", ) stringVar( &v.schedulingLabelValue, "kube-scheduling-label-value", - "higher-spec", + "", kubeDocPrefix+"The label value to use for exclusive scheduling for node selection and toleration", ) } @@ -98,9 +97,6 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB { return nil, errKubeMinVolumeSizeRequired } - if v.useExclusiveScheduling && (len(v.schedulingLabelKey) == 0 || len(v.schedulingLabelValue) == 0) { - return nil, errKubeSchedulingLabelRequired - } return &tmpnet.KubeRuntimeConfig{ ConfigPath: v.config.Path, ConfigContext: v.config.Context, diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index c3bb4e72e4ab..c6fb229e20ca 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -21,6 +21,7 @@ import ( "github.com/ava-labs/avalanchego/config" "github.com/ava-labs/avalanchego/ids" + "github.com/ava-labs/avalanchego/utils/logging" corev1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -49,8 +50,13 @@ const ( // are never scheduled to the same nodes. 
antiAffinityLabelKey = "tmpnet-scheduling" antiAffinityLabelValue = "exclusive" + + // Name of config map containing tmpnet defaults + kubeRuntimeConfigMapName = "tmpnet" ) +var errMissingSchedulingLabels = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when exclusive scheduling is enabled") + type KubeRuntimeConfig struct { // Path to the kubeconfig file identifying the target cluster ConfigPath string `json:"configPath,omitempty"` @@ -72,6 +78,53 @@ type KubeRuntimeConfig struct { SchedulingLabelValue string `json:"schedulingLabelValue,omitempty"` } +// ensureDefaults sets cluster-specific defaults for fields not already set by flags. +func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logger) error { + // Only source defaults from the cluster if exclusive scheduling is enabled + if !c.UseExclusiveScheduling { + return nil + } + + clientset, err := GetClientset(log, c.ConfigPath, c.ConfigContext) + if err != nil { + return err + } + + log.Info("attempting to retrieve configmap containing tmpnet defaults", + zap.String("namespace", c.Namespace), + zap.String("configMap", kubeRuntimeConfigMapName), + ) + + configMap, err := clientset.CoreV1().ConfigMaps(c.Namespace).Get(ctx, kubeRuntimeConfigMapName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get ConfigMap: %w", err) + } + + var ( + schedulingLabelKey = configMap.Data["defaultSchedulingLabelKey"] + schedulingLabelValue = configMap.Data["defaultSchedulingLabelValue"] + ) + if len(c.SchedulingLabelKey) == 0 && len(schedulingLabelKey) > 0 { + log.Info("setting default value for SchedulingLabelKey", + zap.String("schedulingLabelKey", schedulingLabelKey), + ) + c.SchedulingLabelKey = schedulingLabelKey + } + if len(c.SchedulingLabelValue) == 0 && len(schedulingLabelValue) > 0 { + log.Info("setting default value for SchedulingLabelValue", + zap.String("schedulingLabelValue", schedulingLabelValue), + ) + c.SchedulingLabelValue = schedulingLabelValue + } + + // Validate that the scheduling labels are now set + if len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0 { + return errMissingSchedulingLabels + } + + return nil +} + type KubeRuntime struct { node *Node } diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index 25c0f6c42b3c..611bd4c548fb 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -177,7 +177,7 @@ func BootstrapNewNetwork( if err := checkVMBinaries(log, network.Subnets, network.DefaultRuntimeConfig.Process); err != nil { return err } - if err := network.EnsureDefaultConfig(log); err != nil { + if err := network.EnsureDefaultConfig(ctx, log); err != nil { return err } if err := network.Create(rootNetworkDir); err != nil { @@ -234,7 +234,7 @@ func ReadNetwork(ctx context.Context, log logging.Logger, dir string) (*Network, } // Initializes a new network with default configuration. 
-func (n *Network) EnsureDefaultConfig(log logging.Logger) error { +func (n *Network) EnsureDefaultConfig(ctx context.Context, log logging.Logger) error { log.Info("preparing configuration for new network", zap.Any("runtimeConfig", n.DefaultRuntimeConfig), ) @@ -281,6 +281,10 @@ func (n *Network) EnsureDefaultConfig(log logging.Logger) error { return errMissingRuntimeConfig } + if n.DefaultRuntimeConfig.Kube != nil { + return n.DefaultRuntimeConfig.Kube.ensureDefaults(ctx, log) + } + return nil } diff --git a/tests/fixture/tmpnet/network_test.go b/tests/fixture/tmpnet/network_test.go index fa789d7e9935..c632b41186b0 100644 --- a/tests/fixture/tmpnet/network_test.go +++ b/tests/fixture/tmpnet/network_test.go @@ -26,7 +26,7 @@ func TestNetworkSerialization(t *testing.T) { network.PrimarySubnetConfig = ConfigMap{ "validatorOnly": true, } - require.NoError(network.EnsureDefaultConfig(logging.NoLog{})) + require.NoError(network.EnsureDefaultConfig(ctx, logging.NoLog{})) require.NoError(network.Create(tmpDir)) // Ensure node runtime is initialized require.NoError(network.readNodes(ctx)) From 89597cebf129f10743c8084a71962cf5ef65243c Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 8 Jul 2025 04:32:26 +0000 Subject: [PATCH 02/14] fixup: Align with expected configmap --- tests/fixture/tmpnet/kube_runtime.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index c6fb229e20ca..b6095a46caf5 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -80,8 +80,8 @@ type KubeRuntimeConfig struct { // ensureDefaults sets cluster-specific defaults for fields not already set by flags. func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logger) error { - // Only source defaults from the cluster if exclusive scheduling is enabled - if !c.UseExclusiveScheduling { + requireSchedulingDefaults := c.UseExclusiveScheduling && (len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0) + if !requireSchedulingDefaults { return nil } @@ -101,8 +101,8 @@ func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logg } var ( - schedulingLabelKey = configMap.Data["defaultSchedulingLabelKey"] - schedulingLabelValue = configMap.Data["defaultSchedulingLabelValue"] + schedulingLabelKey = configMap.Data["schedulingLabelKey"] + schedulingLabelValue = configMap.Data["schedulingLabelValue"] ) if len(c.SchedulingLabelKey) == 0 && len(schedulingLabelKey) > 0 { log.Info("setting default value for SchedulingLabelKey", From a16b090b20e244cfc4b16c49ba3c682b29173322 Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 8 Jul 2025 05:14:45 +0000 Subject: [PATCH 03/14] fixup: Set defaults before logging --- tests/fixture/tmpnet/network.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index 611bd4c548fb..fe4f1d68a1f3 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -235,6 +235,13 @@ func ReadNetwork(ctx context.Context, log logging.Logger, dir string) (*Network, // Initializes a new network with default configuration. 
func (n *Network) EnsureDefaultConfig(ctx context.Context, log logging.Logger) error {
+	// Populate runtime defaults before logging it
+	if n.DefaultRuntimeConfig.Kube != nil {
+		if err := n.DefaultRuntimeConfig.Kube.ensureDefaults(ctx, log); err != nil {
+			return err
+		}
+	}
+
 	log.Info("preparing configuration for new network",
 		zap.Any("runtimeConfig", n.DefaultRuntimeConfig),
 	)
@@ -281,10 +288,6 @@ func (n *Network) EnsureDefaultConfig(ctx context.Context, log logging.Logger) e
 		return errMissingRuntimeConfig
 	}
 
-	if n.DefaultRuntimeConfig.Kube != nil {
-		return n.DefaultRuntimeConfig.Kube.ensureDefaults(ctx, log)
-	}
-
 	return nil
 }
 

From 12e189f58fd26cd0fab1cdcfdf083c27f82393cc Mon Sep 17 00:00:00 2001
From: maru 
Date: Tue, 8 Jul 2025 05:33:18 +0000
Subject: [PATCH 04/14] fixup: Rename the configmap name

---
 tests/fixture/tmpnet/kube_runtime.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go
index b6095a46caf5..4f5343ab84ed 100644
--- a/tests/fixture/tmpnet/kube_runtime.go
+++ b/tests/fixture/tmpnet/kube_runtime.go
@@ -52,7 +52,7 @@ const (
 	antiAffinityLabelValue = "exclusive"
 
 	// Name of config map containing tmpnet defaults
-	kubeRuntimeConfigMapName = "tmpnet"
+	defaultsConfigMapName = "tmpnet-defaults"
 )
 
 var errMissingSchedulingLabels = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when exclusive scheduling is enabled")
@@ -92,10 +92,10 @@ func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logg
 
 	log.Info("attempting to retrieve configmap containing tmpnet defaults",
 		zap.String("namespace", c.Namespace),
-		zap.String("configMap", kubeRuntimeConfigMapName),
+		zap.String("configMap", defaultsConfigMapName),
 	)
 
-	configMap, err := clientset.CoreV1().ConfigMaps(c.Namespace).Get(ctx, kubeRuntimeConfigMapName, metav1.GetOptions{})
+	configMap, err := clientset.CoreV1().ConfigMaps(c.Namespace).Get(ctx, defaultsConfigMapName, metav1.GetOptions{})
 	if err != nil {
 		return fmt.Errorf("failed to get ConfigMap: %w", err)
 	}

From 86d84b14e28a1946c97d63b4c02c400542b80f9c Mon Sep 17 00:00:00 2001
From: maru 
Date: Tue, 17 Jun 2025 00:17:21 -0700
Subject: [PATCH 05/14] [tmpnet] Enable externally accessible URIs for kube-hosted nodes

Previously, access from outside of a kube cluster was enabled by
port-forwarding through the kube API. This approach proved to be
incompatible with load testing because it greatly limited the rate at
which transactions could be sent. Port-forwarding is now replaced with
an nginx ingress controller to minimize overhead when running outside
of a kube cluster.
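
For example, with the nodeport-exposed ingress controller deployed for
a kind cluster, a node's API becomes reachable through a path-routed
URI of the following form (the network UUID and node ID shown are
placeholders):

    http://localhost:30791/networks/<network-uuid>/<node-id>/ext/health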
--- flake.nix | 28 +- scripts/kind-with-registry.sh | 90 ++++++ tests/antithesis/config.go | 8 +- tests/e2e/c/dynamic_fees.go | 2 +- tests/e2e/p/interchain_workflow.go | 2 +- tests/e2e/p/l1.go | 6 +- tests/e2e/p/staking_rewards.go | 4 +- tests/e2e/p/validator_sets.go | 6 +- tests/e2e/vms/xsvm.go | 8 +- tests/e2e/x/transfer/virtuous.go | 11 +- tests/fixture/e2e/env.go | 42 +-- tests/fixture/e2e/helpers.go | 24 -- tests/fixture/tmpnet/flags/kube_runtime.go | 20 +- tests/fixture/tmpnet/kube_runtime.go | 327 +++++++++++++++++++-- tests/fixture/tmpnet/monitor_kube.go | 2 +- tests/fixture/tmpnet/network.go | 16 +- tests/fixture/tmpnet/node.go | 19 +- tests/fixture/tmpnet/process_runtime.go | 7 +- tests/fixture/tmpnet/start_kind_cluster.go | 121 +++++++- tests/fixture/tmpnet/utils.go | 30 +- tests/load/c/main/main.go | 2 +- 21 files changed, 590 insertions(+), 185 deletions(-) create mode 100755 scripts/kind-with-registry.sh diff --git a/flake.nix b/flake.nix index e4a2580520dd..1ee25aec4e68 100644 --- a/flake.nix +++ b/flake.nix @@ -48,7 +48,6 @@ k9s # Kubernetes TUI kind # Kubernetes-in-Docker kubernetes-helm # Helm CLI (Kubernetes package manager) - self.packages.${system}.kind-with-registry # Script installing kind configured with a local registry # Linters shellcheck @@ -64,32 +63,11 @@ # macOS-specific frameworks darwin.apple_sdk.frameworks.Security ]; - }; - }); - - # Package to install the kind-with-registry script - packages = forAllSystems ({ pkgs }: { - kind-with-registry = pkgs.stdenv.mkDerivation { - pname = "kind-with-registry"; - version = "1.0.0"; - src = pkgs.fetchurl { - url = "https://raw.githubusercontent.com/kubernetes-sigs/kind/7cb9e6be25b48a0e248097eef29d496ab1a044d0/site/static/examples/kind-with-registry.sh"; - sha256 = "0gri0x0ygcwmz8l4h6zzsvydw8rsh7qa8p5218d4hncm363i81hv"; - }; - - phases = [ "installPhase" ]; - - installPhase = '' - mkdir -p $out/bin - install -m755 $src $out/bin/kind-with-registry.sh + # Add scripts/ directory to PATH so kind-with-registry.sh is accessible + shellHook = '' + export PATH="$PWD/scripts:$PATH" ''; - - meta = with pkgs.lib; { - description = "Script to set up kind with a local registry"; - license = licenses.mit; - maintainers = with maintainers; [ "maru-ava" ]; - }; }; }); }; diff --git a/scripts/kind-with-registry.sh b/scripts/kind-with-registry.sh new file mode 100755 index 000000000000..5027432722eb --- /dev/null +++ b/scripts/kind-with-registry.sh @@ -0,0 +1,90 @@ +#!/bin/sh +# Based on https://raw.githubusercontent.com/kubernetes-sigs/kind/7cb9e6be25b48a0e248097eef29d496ab1a044d0/site/static/examples/kind-with-registry.sh +# Original work Copyright 2019 The Kubernetes Authors +# Modifications Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. +# See the file LICENSE for licensing terms. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO(marun) Migrate this script to golang +set -o errexit + +# 1. 
Create registry container unless it already exists +reg_name='kind-registry' +reg_port='5001' +if [ "$(docker inspect -f '{{.State.Running}}' "${reg_name}" 2>/dev/null || true)" != 'true' ]; then + docker run \ + -d --restart=always -p "127.0.0.1:${reg_port}:5000" --network bridge --name "${reg_name}" \ + registry:2 +fi + +# 2. Create kind cluster with containerd registry config dir enabled +# TODO: kind will eventually enable this by default and this patch will +# be unnecessary. +# +# See: +# https://github.com/kubernetes-sigs/kind/issues/2875 +# https://github.com/containerd/containerd/blob/main/docs/cri/config.md#registry-configuration +# See: https://github.com/containerd/containerd/blob/main/docs/hosts.md +cat <= %d", tmpnet.MinimumVolumeSizeGB) + errKubeNamespaceRequired = errors.New("--kube-namespace is required") + errKubeImageRequired = errors.New("--kube-image is required") + errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) + errKubeBaseAccessibleURIRequired = errors.New("--kube-base-accessible-uri is required when running outside of cluster") ) type kubeRuntimeVars struct { @@ -32,6 +34,7 @@ type kubeRuntimeVars struct { useExclusiveScheduling bool schedulingLabelKey string schedulingLabelValue string + baseAccessibleURI string config *KubeconfigVars } @@ -85,6 +88,12 @@ func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[ui "", kubeDocPrefix+"The label value to use for exclusive scheduling for node selection and toleration", ) + stringVar( + &v.baseAccessibleURI, + "kube-base-accessible-uri", + "http://localhost:30791", + kubeDocPrefix+"The base URI for constructing node URIs when running outside of the cluster hosting nodes", + ) } func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, error) { @@ -97,6 +106,9 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB { return nil, errKubeMinVolumeSizeRequired } + if !tmpnet.IsRunningInCluster() && len(v.baseAccessibleURI) == 0 { + return nil, errKubeBaseAccessibleURIRequired + } return &tmpnet.KubeRuntimeConfig{ ConfigPath: v.config.Path, ConfigContext: v.config.Context, @@ -106,5 +118,7 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err UseExclusiveScheduling: v.useExclusiveScheduling, SchedulingLabelKey: v.schedulingLabelKey, SchedulingLabelValue: v.schedulingLabelValue, + // Strip trailing slashes to simplify path composition + BaseAccessibleURI: strings.TrimRight(v.baseAccessibleURI, "/"), }, nil } diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 4f5343ab84ed..63903a191347 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -16,6 +16,7 @@ import ( "go.uber.org/zap" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" @@ -24,6 +25,7 @@ import ( "github.com/ava-labs/avalanchego/utils/logging" corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" restclient "k8s.io/client-go/rest" @@ -76,6 +78,8 @@ type KubeRuntimeConfig struct { SchedulingLabelKey string `json:"schedulingLabelKey,omitempty"` // Label value to use for exclusive scheduling for node selection and toleration SchedulingLabelValue string `json:"schedulingLabelValue,omitempty"` 
+	// Base URI for constructing node URIs when running outside of the cluster hosting nodes (e.g., "http://localhost:30791")
+	BaseAccessibleURI string `json:"baseAccessibleURI,omitempty"`
 }
 
 // ensureDefaults sets cluster-specific defaults for fields not already set by flags.
@@ -145,6 +149,11 @@ func (p *KubeRuntime) readState(ctx context.Context) error {
 		zap.String("statefulSet", statefulSetName),
 	)
 
+	// Validate that it will be possible to construct accessible URIs when running external to the kube cluster
+	if !IsRunningInCluster() && len(runtimeConfig.BaseAccessibleURI) == 0 {
+		return errors.New("BaseAccessibleURI must be set when running outside of the kubernetes cluster")
+	}
+
 	clientset, err := p.getClientset()
 	if err != nil {
 		return err
@@ -187,31 +196,27 @@ func (p *KubeRuntime) readState(ctx context.Context) error {
 	return nil
 }
 
-// GetLocalURI retrieves a URI for the node intended to be accessible from this
-// process until the provided cancel function is called.
-func (p *KubeRuntime) GetLocalURI(ctx context.Context) (string, func(), error) {
-	if len(p.node.URI) == 0 {
-		// Assume that an empty URI indicates a need to read pod state
-		if err := p.readState(ctx); err != nil {
-			return "", func() {}, fmt.Errorf("failed to read Pod state: %w", err)
-		}
-	}
-
-	// Use direct pod URI if running inside the cluster
+// GetAccessibleURI retrieves a URI for the node accessible from where
+// this process is running. If the process is running inside a kube
+// cluster, the node and the process will be assumed to be running in the
+// same kube cluster and the node's URI will be used. If the process is
+// running outside of a kube cluster, a URI accessible from outside of
+// the cluster will be used.
+func (p *KubeRuntime) GetAccessibleURI() string {
 	if IsRunningInCluster() {
-		return p.node.URI, func() {}, nil
+		return p.node.URI
 	}
 
-	port, stopChan, err := p.forwardPort(ctx, config.DefaultHTTPPort)
-	if err != nil {
-		return "", nil, err
-	}
-	return fmt.Sprintf("http://127.0.0.1:%d", port), func() { close(stopChan) }, nil
+	baseURI := p.runtimeConfig().BaseAccessibleURI
+	nodeID := p.node.NodeID.String()
+	networkUUID := p.node.network.UUID
+
+	return fmt.Sprintf("%s/networks/%s/%s", baseURI, networkUUID, nodeID)
 }
 
-// GetLocalStakingAddress retrieves a StakingAddress for the node intended to be
+// GetAccessibleStakingAddress retrieves a StakingAddress for the node intended to be
 // accessible from this process until the provided cancel function is called.
-func (p *KubeRuntime) GetLocalStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { +func (p *KubeRuntime) GetAccessibleStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { if p.node.StakingAddress == (netip.AddrPort{}) { // Assume that an empty staking address indicates a need to retrieve pod state if err := p.readState(ctx); err != nil { @@ -364,6 +369,24 @@ func (p *KubeRuntime) Start(ctx context.Context) error { zap.String("statefulSet", statefulSetName), ) + // Create Service for the node (prefix with 's-' for DNS compatibility) + serviceName := "s-" + statefulSetName + if err := p.createNodeService(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Service for node: %w", err) + } + + // Create Ingress for the node + if err := p.createNodeIngress(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Ingress for node: %w", err) + } + + // Wait for ingress to be ready if running outside cluster + if !IsRunningInCluster() { + if err := p.waitForIngressReadiness(ctx, serviceName); err != nil { + return fmt.Errorf("failed to wait for Ingress readiness: %w", err) + } + } + return p.ensureBootstrapIP(ctx) } @@ -624,9 +647,6 @@ func (p *KubeRuntime) Restart(ctx context.Context) error { } // IsHealthy checks if the node is running and healthy. -// -// TODO(marun) Add WaitForHealthy as a runtime method to minimize API calls required and -// enable reuse of forwarded connection when running external to the kubernetes cluster func (p *KubeRuntime) IsHealthy(ctx context.Context) (bool, error) { err := p.readState(ctx) if err != nil { @@ -636,13 +656,7 @@ func (p *KubeRuntime) IsHealthy(ctx context.Context) (bool, error) { return false, errNotRunning } - uri, cancel, err := p.GetLocalURI(ctx) - if err != nil { - return false, err - } - defer cancel() - - healthReply, err := CheckNodeHealth(ctx, uri) + healthReply, err := CheckNodeHealth(ctx, p.GetAccessibleURI()) if errors.Is(err, ErrUnrecoverableNodeHealthCheck) { return false, err } else if err != nil { @@ -890,6 +904,261 @@ func configureExclusiveScheduling(template *corev1.PodTemplateSpec, labelKey str } } +// createNodeService creates a Kubernetes Service for the node to enable ingress routing +func (p *KubeRuntime) createNodeService(ctx context.Context, serviceName string) error { + var ( + log = p.node.network.log + nodeID = p.node.NodeID.String() + runtimeConfig = p.runtimeConfig() + namespace = runtimeConfig.Namespace + ) + + log.Debug("creating Service for node", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + + clientset, err := p.getClientset() + if err != nil { + return err + } + + service := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceName, + Namespace: namespace, + Labels: map[string]string{ + "app": serviceName, + "network-uuid": p.node.network.UUID, + "node-id": nodeID, + }, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + "network_uuid": p.node.network.UUID, + "node_id": nodeID, + }, + Ports: []corev1.ServicePort{ + { + Name: "http", + Port: config.DefaultHTTPPort, + TargetPort: intstr.FromInt(config.DefaultHTTPPort), + Protocol: corev1.ProtocolTCP, + }, + }, + Type: corev1.ServiceTypeClusterIP, + }, + } + + _, err = clientset.CoreV1().Services(namespace).Create(ctx, service, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Service: %w", err) + } + + log.Debug("created Service", + zap.String("nodeID", nodeID), + 
zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + + return nil +} + +// createNodeIngress creates a Kubernetes Ingress for the node to enable external access +func (p *KubeRuntime) createNodeIngress(ctx context.Context, serviceName string) error { + var ( + log = p.node.network.log + nodeID = p.node.NodeID.String() + runtimeConfig = p.runtimeConfig() + namespace = runtimeConfig.Namespace + networkUUID = p.node.network.UUID + ) + + log.Debug("creating Ingress for node", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + + clientset, err := p.getClientset() + if err != nil { + return err + } + + var ( + ingressClassName = "nginx" // Assume nginx ingress controller + // Path pattern: /networks//(/|$)(.*) + // Using (/|$)(.*) to properly handle trailing slashes + pathPattern = fmt.Sprintf("/networks/%s/%s", networkUUID, nodeID) + "(/|$)(.*)" + pathType = networkingv1.PathTypeImplementationSpecific + ) + + ingress := &networkingv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceName, + Namespace: namespace, + Labels: map[string]string{ + "app": serviceName, + "network-uuid": networkUUID, + "node-id": nodeID, + }, + Annotations: map[string]string{ + "nginx.ingress.kubernetes.io/use-regex": "true", + "nginx.ingress.kubernetes.io/rewrite-target": "/$2", + "nginx.ingress.kubernetes.io/proxy-body-size": "0", + "nginx.ingress.kubernetes.io/proxy-read-timeout": "600", + "nginx.ingress.kubernetes.io/proxy-send-timeout": "600", + }, + }, + Spec: networkingv1.IngressSpec{ + IngressClassName: &ingressClassName, + Rules: []networkingv1.IngressRule{ + { + IngressRuleValue: networkingv1.IngressRuleValue{ + HTTP: &networkingv1.HTTPIngressRuleValue{ + Paths: []networkingv1.HTTPIngressPath{ + { + Path: pathPattern, + PathType: &pathType, + Backend: networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{ + Name: serviceName, + Port: networkingv1.ServiceBackendPort{ + Number: config.DefaultHTTPPort, + }, + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + _, err = clientset.NetworkingV1().Ingresses(namespace).Create(ctx, ingress, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Ingress: %w", err) + } + + log.Debug("created Ingress", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + zap.String("path", pathPattern), + ) + + return nil +} + +// waitForIngressReadiness waits for the ingress to be ready and able to route traffic +// This prevents 503 errors when health checks are performed immediately after node start +func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName string) error { + var ( + log = p.node.network.log + nodeID = p.node.NodeID.String() + runtimeConfig = p.runtimeConfig() + namespace = runtimeConfig.Namespace + ) + + log.Debug("waiting for Ingress readiness", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + + clientset, err := p.getClientset() + if err != nil { + return err + } + + // Wait for the ingress to exist and service endpoints to be available + err = wait.PollUntilContextCancel( + ctx, + statusCheckInterval, + true, // immediate + func(ctx context.Context) (bool, error) { + // Check if ingress exists + _, err := clientset.NetworkingV1().Ingresses(namespace).Get(ctx, serviceName, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + log.Debug("waiting for Ingress to be created", + 
zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return false, nil + } + if err != nil { + log.Warn("failed to retrieve Ingress", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + zap.Error(err), + ) + return false, nil + } + + // Check if service endpoints are available + endpoints, err := clientset.CoreV1().Endpoints(namespace).Get(ctx, serviceName, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + log.Debug("waiting for Service endpoints to be created", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + return false, nil + } + if err != nil { + log.Warn("failed to retrieve Service endpoints", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + zap.Error(err), + ) + return false, nil + } + + // Check if endpoints have at least one ready address + hasReadyEndpoints := false + for _, subset := range endpoints.Subsets { + if len(subset.Addresses) > 0 { + hasReadyEndpoints = true + break + } + } + + if !hasReadyEndpoints { + log.Debug("waiting for Service endpoints to have ready addresses", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + return false, nil + } + + log.Debug("Ingress and Service endpoints are ready", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return true, nil + }, + ) + if err != nil { + return fmt.Errorf("failed to wait for Ingress %s/%s readiness: %w", namespace, serviceName, err) + } + + log.Debug("Ingress is ready", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + + return nil +} + // IsRunningInCluster detects if this code is running inside a Kubernetes cluster // by checking for the presence of the service account token that's automatically // mounted in every pod. diff --git a/tests/fixture/tmpnet/monitor_kube.go b/tests/fixture/tmpnet/monitor_kube.go index 1bd5cc11cbf2..9128e55c80c9 100644 --- a/tests/fixture/tmpnet/monitor_kube.go +++ b/tests/fixture/tmpnet/monitor_kube.go @@ -37,7 +37,7 @@ type kubeCollectorConfig struct { } // DeployKubeCollectors deploys collectors of logs and metrics to a Kubernetes cluster. -func DeployKubeCollectors( +func deployKubeCollectors( ctx context.Context, log logging.Logger, configPath string, diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index fe4f1d68a1f3..d167db12c48a 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -449,11 +449,7 @@ func (n *Network) Bootstrap(ctx context.Context, log logging.Logger) error { } // Don't restart the node during subnet creation since it will always be restarted afterwards. - uri, cancel, err := bootstrapNode.GetLocalURI(ctx) - if err != nil { - return err - } - defer cancel() + uri := bootstrapNode.GetAccessibleURI() if err := n.CreateSubnets(ctx, log, uri, false /* restartRequired */); err != nil { return err } @@ -784,11 +780,9 @@ func (n *Network) GetNode(nodeID ids.NodeID) (*Node, error) { return nil, fmt.Errorf("%s is not known to the network", nodeID) } -// GetNodeURIs returns the URIs of nodes in the network that are running and not ephemeral. 
The URIs -// returned are guaranteed be reachable by the caller until the cleanup function is called regardless -// of whether the nodes are running as local processes or in a kube cluster. -func (n *Network) GetNodeURIs(ctx context.Context, deferCleanupFunc func(func())) ([]NodeURI, error) { - return GetNodeURIs(ctx, n.Nodes, deferCleanupFunc) +// GetNodeURIs returns the accessible URIs of nodes in the network that are running and not ephemeral. +func (n *Network) GetNodeURIs() []NodeURI { + return GetNodeURIs(n.Nodes) } // GetAvailableNodeIDs returns the node IDs of nodes in the network that are running and not ephemeral. @@ -961,7 +955,7 @@ func waitForHealthy(ctx context.Context, log logging.Logger, nodes []*Node) erro unhealthyNodes.Remove(node) log.Info("node is healthy", zap.Stringer("nodeID", node.NodeID), - zap.String("uri", node.URI), + zap.String("uri", node.GetAccessibleURI()), ) } diff --git a/tests/fixture/tmpnet/node.go b/tests/fixture/tmpnet/node.go index a76eb02a7e44..347253356b68 100644 --- a/tests/fixture/tmpnet/node.go +++ b/tests/fixture/tmpnet/node.go @@ -41,8 +41,8 @@ var ( // NodeRuntime defines the methods required to support running a node. type NodeRuntime interface { readState(ctx context.Context) error - GetLocalURI(ctx context.Context) (string, func(), error) - GetLocalStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) + GetAccessibleURI() string + GetAccessibleStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) Start(ctx context.Context) error InitiateStop(ctx context.Context) error WaitForStopped(ctx context.Context) error @@ -199,12 +199,12 @@ func (n *Node) readState(ctx context.Context) error { return n.getRuntime().readState(ctx) } -func (n *Node) GetLocalURI(ctx context.Context) (string, func(), error) { - return n.getRuntime().GetLocalURI(ctx) +func (n *Node) GetAccessibleURI() string { + return n.getRuntime().GetAccessibleURI() } -func (n *Node) GetLocalStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { - return n.getRuntime().GetLocalStakingAddress(ctx) +func (n *Node) GetAccessibleStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { + return n.getRuntime().GetAccessibleStakingAddress(ctx) } // Writes the current state of the metrics endpoint to disk @@ -213,12 +213,7 @@ func (n *Node) SaveMetricsSnapshot(ctx context.Context) error { // No URI to request metrics from return nil } - baseURI, cancel, err := n.GetLocalURI(ctx) - if err != nil { - return nil - } - defer cancel() - uri := baseURI + "/ext/metrics" + uri := n.GetAccessibleURI() + "/ext/metrics" req, err := http.NewRequestWithContext(ctx, http.MethodGet, uri, nil) if err != nil { return err diff --git a/tests/fixture/tmpnet/process_runtime.go b/tests/fixture/tmpnet/process_runtime.go index 9088f0928210..94d7154c3d59 100644 --- a/tests/fixture/tmpnet/process_runtime.go +++ b/tests/fixture/tmpnet/process_runtime.go @@ -417,11 +417,12 @@ func (p *ProcessRuntime) writeMonitoringConfigFile(name string, config []ConfigM return nil } -func (p *ProcessRuntime) GetLocalURI(_ context.Context) (string, func(), error) { - return p.node.URI, func() {}, nil +// GetAccessibleURI returns the URI that can be used to access the node's API. 
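+// For the process runtime the node always runs on the local host, so its
+// URI can be returned without translation.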
+func (p *ProcessRuntime) GetAccessibleURI() string { + return p.node.URI } -func (p *ProcessRuntime) GetLocalStakingAddress(_ context.Context) (netip.AddrPort, func(), error) { +func (p *ProcessRuntime) GetAccessibleStakingAddress(_ context.Context) (netip.AddrPort, func(), error) { return p.node.StakingAddress, func() {}, nil } diff --git a/tests/fixture/tmpnet/start_kind_cluster.go b/tests/fixture/tmpnet/start_kind_cluster.go index 1957005c4d21..4b49c179fcd6 100644 --- a/tests/fixture/tmpnet/start_kind_cluster.go +++ b/tests/fixture/tmpnet/start_kind_cluster.go @@ -13,6 +13,7 @@ import ( "strings" "go.uber.org/zap" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" @@ -37,6 +38,13 @@ const ( // TODO(marun) Check for the presence of the context rather than string matching on this error missingContextMsg = `context "` + KindKubeconfigContext + `" does not exist` + + // Ingress controller constants + ingressNamespace = "ingress-nginx" + ingressReleaseName = "ingress-nginx" + ingressChartRepo = "https://kubernetes.github.io/ingress-nginx" + ingressChartName = "ingress-nginx/ingress-nginx" + ingressControllerName = "ingress-nginx-controller" ) //go:embed yaml/tmpnet-rbac.yaml @@ -96,10 +104,14 @@ func StartKindCluster( return fmt.Errorf("failed to create service account kubeconfig context: %w", err) } - if err := DeployKubeCollectors(ctx, log, configPath, configContext, startMetricsCollector, startLogsCollector); err != nil { + if err := deployKubeCollectors(ctx, log, configPath, configContext, startMetricsCollector, startLogsCollector); err != nil { return fmt.Errorf("failed to deploy kube collectors: %w", err) } + if err := deployIngressController(ctx, log, configPath, configContext); err != nil { + return fmt.Errorf("failed to deploy ingress controller: %w", err) + } + return nil } @@ -285,3 +297,110 @@ func createServiceAccountKubeconfig( return nil } + +// deployIngressController deploys the nginx ingress controller using Helm. 
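+// Deployment is skipped if a controller deployment is already present in the cluster.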
+func deployIngressController(ctx context.Context, log logging.Logger, configPath string, configContext string) error {
+	log.Info("checking if nginx ingress controller is already running")
+
+	isRunning, err := isIngressControllerRunning(ctx, log, configPath, configContext)
+	if err != nil {
+		return fmt.Errorf("failed to check nginx ingress controller status: %w", err)
+	}
+	if isRunning {
+		log.Info("nginx ingress controller already running")
+		return nil
+	}
+
+	log.Info("deploying nginx ingress controller using Helm")
+
+	// Add the helm repo for ingress-nginx
+	if err := runHelmCommand(ctx, "repo", "add", "ingress-nginx", ingressChartRepo); err != nil {
+		return fmt.Errorf("failed to add helm repo: %w", err)
+	}
+	if err := runHelmCommand(ctx, "repo", "update"); err != nil {
+		return fmt.Errorf("failed to update helm repos: %w", err)
+	}
+
+	// Install nginx-ingress with values set directly via flags
+	// Using fixed nodePort 30791 for cross-platform compatibility
+	args := []string{
+		"install",
+		ingressReleaseName,
+		ingressChartName,
+		"--namespace", ingressNamespace,
+		"--create-namespace",
+		"--wait",
+		"--set", "controller.service.type=NodePort",
+		// This port value must match the port configured in scripts/kind-with-registry.sh
+		"--set", "controller.service.nodePorts.http=30791",
+		"--set", "controller.admissionWebhooks.enabled=false",
+		"--set", "controller.config.proxy-read-timeout=600",
+		"--set", "controller.config.proxy-send-timeout=600",
+		"--set", "controller.config.proxy-body-size=0",
+		"--set", "controller.config.proxy-http-version=1.1",
+		"--set", "controller.metrics.enabled=true",
+	}
+
+	if err := runHelmCommand(ctx, args...); err != nil {
+		return fmt.Errorf("failed to install nginx-ingress: %w", err)
+	}
+
+	return waitForIngressController(ctx, log, configPath, configContext)
+}
+
+// isIngressControllerRunning checks if the nginx ingress controller is already running.
+func isIngressControllerRunning(ctx context.Context, log logging.Logger, configPath string, configContext string) (bool, error) {
+	clientset, err := GetClientset(log, configPath, configContext)
+	if err != nil {
+		return false, err
+	}
+
+	// TODO(marun) Handle the case of the deployment being in a failed state
+	_, err = clientset.AppsV1().Deployments(ingressNamespace).Get(ctx, ingressControllerName, metav1.GetOptions{})
+	if apierrors.IsNotFound(err) {
+		return false, nil
+	}
+	if err != nil {
+		return false, err
+	}
+	return true, nil
+}
+
+// waitForIngressController waits for the nginx ingress controller to be ready.
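+// Readiness is signaled by the controller deployment reporting at least one ready replica.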
+func waitForIngressController(ctx context.Context, log logging.Logger, configPath string, configContext string) error {
+	clientset, err := GetClientset(log, configPath, configContext)
+	if err != nil {
+		return fmt.Errorf("failed to get clientset: %w", err)
+	}
+
+	return wait.PollUntilContextCancel(ctx, statusCheckInterval, true /* immediate */, func(ctx context.Context) (bool, error) {
+		deployment, err := clientset.AppsV1().Deployments(ingressNamespace).Get(ctx, ingressControllerName, metav1.GetOptions{})
+		if err != nil {
+			log.Debug("failed to get nginx ingress controller deployment",
+				zap.String("namespace", ingressNamespace),
+				zap.String("deployment", ingressControllerName),
+				zap.Error(err),
+			)
+			return false, nil
+		}
+		if deployment.Status.ReadyReplicas == 0 {
+			log.Debug("waiting for nginx ingress controller to become ready",
+				zap.String("namespace", ingressNamespace),
+				zap.String("deployment", ingressControllerName),
+				zap.Int32("readyReplicas", deployment.Status.ReadyReplicas),
+				zap.Int32("replicas", deployment.Status.Replicas),
+			)
+			return false, nil
+		}
+
+		log.Info("nginx ingress controller is ready",
+			zap.String("namespace", ingressNamespace),
+			zap.String("deployment", ingressControllerName),
+			zap.Int32("readyReplicas", deployment.Status.ReadyReplicas),
+		)
+		return true, nil
+	})
+}
+
+// runHelmCommand runs a Helm command with the given arguments.
+func runHelmCommand(ctx context.Context, args ...string) error {
+	cmd := exec.CommandContext(ctx, "helm", args...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	return cmd.Run()
+}
diff --git a/tests/fixture/tmpnet/utils.go b/tests/fixture/tmpnet/utils.go
index e80070326900..121fe6a062bb 100644
--- a/tests/fixture/tmpnet/utils.go
+++ b/tests/fixture/tmpnet/utils.go
@@ -44,6 +44,14 @@ func CheckNodeHealth(ctx context.Context, uri string) (*health.APIReply, error)
 			return nil, err
 		}
 	}
+
+	// Assume `503 Service Unavailable` is the result of the ingress
+	// for the node not being ready.
+	// TODO(marun) Update Client.Health() to return a typed error
+	if err != nil && err.Error() == "received status code: 503" {
+		return nil, err
+	}
+
 	// Assume all other errors are not recoverable
 	return nil, fmt.Errorf("%w: %w", ErrUnrecoverableNodeHealthCheck, err)
 }
@@ -54,25 +62,18 @@ type NodeURI struct {
 	URI    string
 }
 
-// GetNodeURIs returns the URIs of the provided nodes that are running and not ephemeral. The URIs returned
-// are guaranteed be reachable by the caller until the cleanup function is called regardless of whether the
-// nodes are running as local processes or in a kube cluster.
-func GetNodeURIs(ctx context.Context, nodes []*Node, deferCleanupFunc func(func())) ([]NodeURI, error) {
+// GetNodeURIs returns the accessible URIs of the provided nodes that are running and not ephemeral.
+func GetNodeURIs(nodes []*Node) []NodeURI {
 	availableNodes := FilterAvailableNodes(nodes)
 	uris := []NodeURI{}
 	for _, node := range availableNodes {
-		uri, cancel, err := node.GetLocalURI(ctx)
-		if err != nil {
-			return nil, err
-		}
-		deferCleanupFunc(cancel)
 		uris = append(uris, NodeURI{
 			NodeID: node.NodeID,
-			URI:    uri,
+			URI:    node.GetAccessibleURI(),
 		})
 	}
-	return uris, nil
+	return uris
 }
 
 // FilterAvailableNodes filters the provided nodes by whether they are running and not ephemeral.
@@ -96,15 +97,10 @@ func FilterAvailableNodes(nodes []*Node) []*Node {
 // blockchain ID, in the form "ws://<uri>/ext/bc/<blockchainID>/ws".
 // Ephemeral and stopped nodes are ignored. 
 func GetNodeWebsocketURIs(
-	ctx context.Context,
 	nodes []*Node,
 	blockchainID string,
-	deferCleanupFunc func(func()),
 ) ([]string, error) {
-	nodeURIs, err := GetNodeURIs(ctx, nodes, deferCleanupFunc)
-	if err != nil {
-		return nil, fmt.Errorf("failed to get node URIs: %w", err)
-	}
+	nodeURIs := GetNodeURIs(nodes)
 	wsURIs := make([]string, len(nodeURIs))
 	for i := range nodeURIs {
 		uri, err := url.Parse(nodeURIs[i].URI)
diff --git a/tests/load/c/main/main.go b/tests/load/c/main/main.go
index d1673f4baab9..914616a9aad0 100644
--- a/tests/load/c/main/main.go
+++ b/tests/load/c/main/main.go
@@ -92,7 +92,7 @@ func main() {
 		)
 	})
 
-	endpoints, err := tmpnet.GetNodeWebsocketURIs(ctx, network.Nodes, blockchainID, tc.DeferCleanup)
+	endpoints, err := tmpnet.GetNodeWebsocketURIs(network.Nodes, blockchainID)
 	require.NoError(err, "failed to get node websocket URIs")
 
 	w := &workload{

From 759e36724ab0390fdb3ad0a1bf88d7232314f7db Mon Sep 17 00:00:00 2001
From: maru 
Date: Thu, 26 Jun 2025 18:17:21 +0000
Subject: [PATCH 06/14] fixup: Update RBAC to support ingress deployment

---
 tests/fixture/tmpnet/kube_runtime.go       |  6 +++---
 tests/fixture/tmpnet/start_kind_cluster.go | 12 +++++++++---
 tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml | 11 +++++++++++
 3 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go
index 63903a191347..5d808c8dd74a 100644
--- a/tests/fixture/tmpnet/kube_runtime.go
+++ b/tests/fixture/tmpnet/kube_runtime.go
@@ -1083,7 +1083,7 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s
 			// Check if ingress exists
 			_, err := clientset.NetworkingV1().Ingresses(namespace).Get(ctx, serviceName, metav1.GetOptions{})
 			if apierrors.IsNotFound(err) {
-				log.Debug("waiting for Ingress to be created",
+				log.Verbo("waiting for Ingress to be created",
 					zap.String("nodeID", nodeID),
 					zap.String("namespace", namespace),
 					zap.String("ingress", serviceName),
@@ -1103,7 +1103,7 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s
 			// Check if service endpoints are available
 			endpoints, err := clientset.CoreV1().Endpoints(namespace).Get(ctx, serviceName, metav1.GetOptions{})
 			if apierrors.IsNotFound(err) {
-				log.Debug("waiting for Service endpoints to be created",
+				log.Verbo("waiting for Service endpoints to be created",
 					zap.String("nodeID", nodeID),
 					zap.String("namespace", namespace),
 					zap.String("service", serviceName),
@@ -1130,7 +1130,7 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s
 		}
 
 		if !hasReadyEndpoints {
-			log.Debug("waiting for Service endpoints to have ready addresses",
+			log.Verbo("waiting for Service endpoints to have ready addresses",
 				zap.String("nodeID", nodeID),
 				zap.String("namespace", namespace),
 				zap.String("service", serviceName),
diff --git a/tests/fixture/tmpnet/start_kind_cluster.go b/tests/fixture/tmpnet/start_kind_cluster.go
index 4b49c179fcd6..cad7e186da39 100644
--- a/tests/fixture/tmpnet/start_kind_cluster.go
+++ b/tests/fixture/tmpnet/start_kind_cluster.go
@@ -239,13 +239,18 @@ func createServiceAccountKubeconfig(
 		return fmt.Errorf("failed to load kubeconfig: %w", err)
 	}
 
-	// Check if the context already exists
 	if _, exists := config.Contexts[newContextName]; exists {
-		log.Info("service account kubeconfig context already exists",
+		log.Info("service account kubeconfig context exists, recreating to ensure consistency with cluster state",
+			zap.String("kubeconfig", configPath),
+			zap.String("context", newContextName),
+			
zap.String("namespace", namespace), + ) + } else { + log.Info("creating new service account kubeconfig context", + zap.String("kubeconfig", configPath), zap.String("context", newContextName), zap.String("namespace", namespace), ) - return nil } // Get the current context (already verified to exist by StartKindCluster) @@ -291,6 +296,7 @@ func createServiceAccountKubeconfig( } log.Info("created service account kubeconfig context", + zap.String("kubeconfig", configPath), zap.String("context", newContextName), zap.String("namespace", namespace), ) diff --git a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml index 0d3056614a85..e7f230a72515 100644 --- a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml +++ b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml @@ -11,6 +11,7 @@ metadata: name: tmpnet namespace: tmpnet rules: +# Regular usage - apiGroups: ["apps"] resources: ["statefulsets"] verbs: ["get", "create", "update", "patch"] @@ -23,6 +24,16 @@ rules: - apiGroups: [""] resources: ["pods/portforward"] verbs: ["create"] +# Enable external node access via ingress +- apiGroups: ["networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "create"] +- apiGroups: [""] + resources: ["endpoints"] + verbs: ["get"] +- apiGroups: [""] + resources: ["services"] + verbs: ["create"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding From 7495d80b8472076461f67a76e2d81152a1d58c93 Mon Sep 17 00:00:00 2001 From: maru Date: Thu, 26 Jun 2025 18:24:36 +0000 Subject: [PATCH 07/14] fixup: Avoid deploying service or ingress when inside the cluster --- tests/fixture/tmpnet/kube_runtime.go | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 5d808c8dd74a..46dc1c641ee0 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -369,19 +369,18 @@ func (p *KubeRuntime) Start(ctx context.Context) error { zap.String("statefulSet", statefulSetName), ) - // Create Service for the node (prefix with 's-' for DNS compatibility) - serviceName := "s-" + statefulSetName - if err := p.createNodeService(ctx, serviceName); err != nil { - return fmt.Errorf("failed to create Service for node: %w", err) - } + if !IsRunningInCluster() { + // If running outside the cluster, ensure the node's API port is accessible via ingress - // Create Ingress for the node - if err := p.createNodeIngress(ctx, serviceName); err != nil { - return fmt.Errorf("failed to create Ingress for node: %w", err) - } + serviceName := "s-" + statefulSetName // The 's-' prefix ensures DNS compatibility + if err := p.createNodeService(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Service for node: %w", err) + } + + if err := p.createNodeIngress(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Ingress for node: %w", err) + } - // Wait for ingress to be ready if running outside cluster - if !IsRunningInCluster() { if err := p.waitForIngressReadiness(ctx, serviceName); err != nil { return fmt.Errorf("failed to wait for Ingress readiness: %w", err) } From f296385ba9b43633ba36059ac80aef2182549858 Mon Sep 17 00:00:00 2001 From: maru Date: Fri, 27 Jun 2025 01:01:30 +0000 Subject: [PATCH 08/14] fixup: Avoid deploying to non-kind cluster without base accessible uri --- .github/workflows/ci.yml | 1 + scripts/tests.e2e.kube.sh | 15 ++++++++- tests/fixture/tmpnet/flags/kube_runtime.go | 14 ++++++-- tests/fixture/tmpnet/kube_runtime.go | 
39 +++++++++++++++++++---
 tests/log.go                               |  3 +-
 5 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e57e857af5bc..3bd3fdd615c8 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -255,6 +255,7 @@ jobs:
       - uses: ./.github/actions/run-monitored-tmpnet-cmd
         with:
           run: ./scripts/run_task.sh test-load-kube
+          runtime: kube
          artifact_prefix: load-kube
           prometheus_username: ${{ secrets.PROMETHEUS_ID || '' }}
           prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }}
diff --git a/scripts/tests.e2e.kube.sh b/scripts/tests.e2e.kube.sh
index 11c16104e7ae..d34067ed03dc 100755
--- a/scripts/tests.e2e.kube.sh
+++ b/scripts/tests.e2e.kube.sh
@@ -23,4 +23,17 @@ else
   XSVM_IMAGE="${XSVM_IMAGE}" AVALANCHEGO_IMAGE="${AVALANCHEGO_IMAGE}" bash -x ./scripts/build_xsvm_image.sh
 fi
 
-bash -x ./scripts/tests.e2e.sh --runtime=kube --kube-image="${XSVM_IMAGE}" "$@"
+# Determine kubeconfig context to use
+KUBECONFIG_CONTEXT=""
+
+# Check if --kubeconfig-context is already provided in arguments
+if [[ "$*" =~ --kubeconfig-context ]]; then
+  # User provided a context, use it as-is
+  echo "Using provided kubeconfig context from arguments"
+else
+  # Default to the RBAC context
+  KUBECONFIG_CONTEXT="--kubeconfig-context=kind-kind-tmpnet"
+  echo "Defaulting to limited-permission context 'kind-kind-tmpnet' to test RBAC Role permissions"
+fi
+
+# Expand the context flag only when set to avoid passing an empty argument
+bash -x ./scripts/tests.e2e.sh --runtime=kube --kube-image="${XSVM_IMAGE}" ${KUBECONFIG_CONTEXT:+"$KUBECONFIG_CONTEXT"} "$@"
diff --git a/tests/fixture/tmpnet/flags/kube_runtime.go b/tests/fixture/tmpnet/flags/kube_runtime.go
index af43e9040363..067104cf733d 100644
--- a/tests/fixture/tmpnet/flags/kube_runtime.go
+++ b/tests/fixture/tmpnet/flags/kube_runtime.go
@@ -91,7 +91,7 @@ func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[ui
 	stringVar(
 		&v.baseAccessibleURI,
 		"kube-base-accessible-uri",
-		"http://localhost:30791",
+		"",
 		kubeDocPrefix+"The base URI for constructing node URIs when running outside of the cluster hosting nodes",
 	)
 }
@@ -106,7 +106,15 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err
 	if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB {
 		return nil, errKubeMinVolumeSizeRequired
 	}
-	if !tmpnet.IsRunningInCluster() && len(v.baseAccessibleURI) == 0 {
+	baseAccessibleURI := v.baseAccessibleURI
+	if strings.HasPrefix(v.config.Context, "kind-kind") && len(baseAccessibleURI) == 0 {
+		// Use the base uri expected for the kind cluster deployed by tmpnet. Not supplying this as a default
+		// ensures that an explicit value is required for non-kind clusters.
+		//
+		// TODO(marun) Log why this value is being used. This will require passing a log through the call chain. 
+ baseAccessibleURI = "http://localhost:30791" + } + if !tmpnet.IsRunningInCluster() && len(baseAccessibleURI) == 0 { return nil, errKubeBaseAccessibleURIRequired } return &tmpnet.KubeRuntimeConfig{ @@ -119,6 +127,6 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err SchedulingLabelKey: v.schedulingLabelKey, SchedulingLabelValue: v.schedulingLabelValue, // Strip trailing slashes to simplify path composition - BaseAccessibleURI: strings.TrimRight(v.baseAccessibleURI, "/"), + BaseAccessibleURI: strings.TrimRight(baseAccessibleURI, "/"), }, nil } diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 46dc1c641ee0..e73add3e64fb 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -1073,14 +1073,14 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s return err } - // Wait for the ingress to exist and service endpoints to be available + // Wait for the ingress to exist, be processed by the controller, and service endpoints to be available err = wait.PollUntilContextCancel( ctx, statusCheckInterval, true, // immediate func(ctx context.Context) (bool, error) { - // Check if ingress exists - _, err := clientset.NetworkingV1().Ingresses(namespace).Get(ctx, serviceName, metav1.GetOptions{}) + // Check if ingress exists and is processed by the controller + ingress, err := clientset.NetworkingV1().Ingresses(namespace).Get(ctx, serviceName, metav1.GetOptions{}) if apierrors.IsNotFound(err) { log.Verbo("waiting for Ingress to be created", zap.String("nodeID", nodeID), @@ -1099,6 +1099,37 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s return false, nil } + // Check if ingress controller has processed the ingress + // The ingress controller should populate the Status.LoadBalancer.Ingress field + // when it has successfully processed and exposed the ingress + hasIngressIP := len(ingress.Status.LoadBalancer.Ingress) > 0 + if !hasIngressIP { + log.Verbo("waiting for Ingress controller to process and expose the Ingress", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return false, nil + } + + // Validate that at least one ingress has an IP or hostname + hasValidIngress := false + for _, ing := range ingress.Status.LoadBalancer.Ingress { + if ing.IP != "" || ing.Hostname != "" { + hasValidIngress = true + break + } + } + + if !hasValidIngress { + log.Verbo("waiting for Ingress controller to assign IP or hostname", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return false, nil + } + // Check if service endpoints are available endpoints, err := clientset.CoreV1().Endpoints(namespace).Get(ctx, serviceName, metav1.GetOptions{}) if apierrors.IsNotFound(err) { @@ -1137,7 +1168,7 @@ func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName s return false, nil } - log.Debug("Ingress and Service endpoints are ready", + log.Debug("Ingress is exposed by controller and Service endpoints are ready", zap.String("nodeID", nodeID), zap.String("namespace", namespace), zap.String("ingress", serviceName), diff --git a/tests/log.go b/tests/log.go index a431feb3c1ec..134cc9cd1f78 100644 --- a/tests/log.go +++ b/tests/log.go @@ -25,5 +25,6 @@ func LoggerForFormat(prefix string, rawLogFormat string) (logging.Logger, error) if err != nil { return nil, err } - return logging.NewLogger(prefix, 
logging.NewWrappedCore(logging.Verbo, writeCloser, logFormat.ConsoleEncoder())), nil + // TODO(marun) Make the log level configurable + return logging.NewLogger(prefix, logging.NewWrappedCore(logging.Debug, writeCloser, logFormat.ConsoleEncoder())), nil } From d4348dd1e7013948d47c8549ad6172f2c7e41ef8 Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 1 Jul 2025 03:20:45 +0000 Subject: [PATCH 09/14] fixup: Source ingress configuration from the cluster --- tests/fixture/tmpnet/flags/kube_runtime.go | 28 +--- tests/fixture/tmpnet/kube_runtime.go | 152 ++++++++++++++------- tests/fixture/tmpnet/start_kind_cluster.go | 49 +++++++ tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml | 3 + tests/load2/main/main.go | 2 +- 5 files changed, 159 insertions(+), 75 deletions(-) diff --git a/tests/fixture/tmpnet/flags/kube_runtime.go b/tests/fixture/tmpnet/flags/kube_runtime.go index 067104cf733d..3eee81ea99d9 100644 --- a/tests/fixture/tmpnet/flags/kube_runtime.go +++ b/tests/fixture/tmpnet/flags/kube_runtime.go @@ -7,7 +7,6 @@ import ( "errors" "flag" "fmt" - "strings" "github.com/spf13/pflag" @@ -21,10 +20,9 @@ const ( ) var ( - errKubeNamespaceRequired = errors.New("--kube-namespace is required") - errKubeImageRequired = errors.New("--kube-image is required") - errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) - errKubeBaseAccessibleURIRequired = errors.New("--kube-base-accessible-uri is required when running outside of cluster") + errKubeNamespaceRequired = errors.New("--kube-namespace is required") + errKubeImageRequired = errors.New("--kube-image is required") + errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) ) type kubeRuntimeVars struct { @@ -34,7 +32,6 @@ type kubeRuntimeVars struct { useExclusiveScheduling bool schedulingLabelKey string schedulingLabelValue string - baseAccessibleURI string config *KubeconfigVars } @@ -88,12 +85,6 @@ func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[ui "", kubeDocPrefix+"The label value to use for exclusive scheduling for node selection and toleration", ) - stringVar( - &v.baseAccessibleURI, - "kube-base-accessible-uri", - "", - kubeDocPrefix+"The base URI for constructing node URIs when running outside of the cluster hosting nodes", - ) } func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, error) { @@ -106,17 +97,6 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB { return nil, errKubeMinVolumeSizeRequired } - baseAccessibleURI := v.baseAccessibleURI - if strings.HasPrefix(v.config.Context, "kind-kind") && len(baseAccessibleURI) == 0 { - // Use the base uri expected for the kind cluster deployed by tmpnet. Not supplying this as a default - // ensures that an explicit value is required for non-kind clusters. - // - // TODO(marun) Log why this value is being used. This will require passing a log through the call chain. 
- baseAccessibleURI = "http://localhost:30791" - } - if !tmpnet.IsRunningInCluster() && len(baseAccessibleURI) == 0 { - return nil, errKubeBaseAccessibleURIRequired - } return &tmpnet.KubeRuntimeConfig{ ConfigPath: v.config.Path, ConfigContext: v.config.Context, @@ -126,7 +106,5 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err UseExclusiveScheduling: v.useExclusiveScheduling, SchedulingLabelKey: v.schedulingLabelKey, SchedulingLabelValue: v.schedulingLabelValue, - // Strip trailing slashes to simplify path composition - BaseAccessibleURI: strings.TrimRight(baseAccessibleURI, "/"), }, nil } diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index e73add3e64fb..1c2eb7f168b8 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -55,9 +55,13 @@ const ( // Name of config map containing tmpnet defaults defaultsConfigMapName = "tmpnet-defaults" + ingressHostKey = "ingressHost" ) -var errMissingSchedulingLabels = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when exclusive scheduling is enabled") +var ( + errMissingSchedulingLabels = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when exclusive scheduling is enabled") + errMissingIngressHost = errors.New("IngressHost is a required value. Ensure the " + defaultsConfigMapName + " ConfigMap contains an entry for " + ingressHostKey) +) type KubeRuntimeConfig struct { // Path to the kubeconfig file identifying the target cluster @@ -78,14 +82,18 @@ type KubeRuntimeConfig struct { SchedulingLabelKey string `json:"schedulingLabelKey,omitempty"` // Label value to use for exclusive scheduling for node selection and toleration SchedulingLabelValue string `json:"schedulingLabelValue,omitempty"` - // Base URI for constructing node URIs when running outside of the cluster hosting nodes (e.g., "http://localhost:30791") - BaseAccessibleURI string `json:"baseAccessibleURI,omitempty"` + // Host for ingress rules (e.g., "localhost:30791" for kind, "tmpnet.example.com" for EKS) + IngressHost string `json:"ingressHost,omitempty"` + // TLS secret name for ingress (empty for HTTP, populated for HTTPS) + IngressSecret string `json:"ingressSecret,omitempty"` } // ensureDefaults sets cluster-specific defaults for fields not already set by flags. 
func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logger) error { + // Only read defaults if necessary requireSchedulingDefaults := c.UseExclusiveScheduling && (len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0) - if !requireSchedulingDefaults { + requireIngressDefaults := !IsRunningInCluster() && len(c.IngressHost) == 0 + if !requireSchedulingDefaults && !requireIngressDefaults { return nil } @@ -104,26 +112,47 @@ func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logg return fmt.Errorf("failed to get ConfigMap: %w", err) } - var ( - schedulingLabelKey = configMap.Data["schedulingLabelKey"] - schedulingLabelValue = configMap.Data["schedulingLabelValue"] - ) - if len(c.SchedulingLabelKey) == 0 && len(schedulingLabelKey) > 0 { - log.Info("setting default value for SchedulingLabelKey", - zap.String("schedulingLabelKey", schedulingLabelKey), + if requireSchedulingDefaults { + var ( + schedulingLabelKey = configMap.Data["schedulingLabelKey"] + schedulingLabelValue = configMap.Data["schedulingLabelValue"] ) - c.SchedulingLabelKey = schedulingLabelKey + if len(c.SchedulingLabelKey) == 0 && len(schedulingLabelKey) > 0 { + log.Info("setting default value for SchedulingLabelKey", + zap.String("schedulingLabelKey", schedulingLabelKey), + ) + c.SchedulingLabelKey = schedulingLabelKey + } + if len(c.SchedulingLabelValue) == 0 && len(schedulingLabelValue) > 0 { + log.Info("setting default value for SchedulingLabelValue", + zap.String("schedulingLabelValue", schedulingLabelValue), + ) + c.SchedulingLabelValue = schedulingLabelValue + } + if len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0 { + return errMissingSchedulingLabels + } } - if len(c.SchedulingLabelValue) == 0 && len(schedulingLabelValue) > 0 { - log.Info("setting default value for SchedulingLabelValue", - zap.String("schedulingLabelValue", schedulingLabelValue), + if requireIngressDefaults { + var ( + ingressHost = configMap.Data[ingressHostKey] + ingressSecret = configMap.Data["ingressSecret"] ) - c.SchedulingLabelValue = schedulingLabelValue - } - - // Validate that the scheduling labels are now set - if len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0 { - return errMissingSchedulingLabels + if len(c.IngressHost) == 0 && len(ingressHost) > 0 { + log.Info("setting default value for IngressHost", + zap.String("ingressHost", ingressHost), + ) + c.IngressHost = ingressHost + } + if len(c.IngressSecret) == 0 && len(ingressSecret) > 0 { + log.Info("setting default value for IngressSecret", + zap.String("ingressSecret", ingressSecret), + ) + c.IngressSecret = ingressSecret + } + if len(c.IngressHost) == 0 { + return errMissingIngressHost + } } return nil @@ -150,8 +179,8 @@ func (p *KubeRuntime) readState(ctx context.Context) error { ) // Validate that it will be possible to construct accessible URIs when running external to the kube cluster - if !IsRunningInCluster() && len(runtimeConfig.BaseAccessibleURI) == 0 { - return errors.New("BaseAccessibleURI must be set when running outside of the kubernetes cluster") + if !IsRunningInCluster() && len(runtimeConfig.IngressHost) == 0 { + return errors.New("IngressHost must be set when running outside of the kubernetes cluster") } clientset, err := p.getClientset() @@ -207,11 +236,18 @@ func (p *KubeRuntime) GetAccessibleURI() string { return p.node.URI } - baseURI := p.runtimeConfig().BaseAccessibleURI - nodeID := p.node.NodeID.String() - networkUUID := p.node.network.UUID + var ( + protocol = "http" + 
nodeID = p.node.NodeID.String() + networkUUID = p.node.network.UUID + runtimeConfig = p.runtimeConfig() + ) + // Assume tls is configured for an ingress secret + if len(runtimeConfig.IngressSecret) > 0 { + protocol = "https" + } - return fmt.Sprintf("%s/networks/%s/%s", baseURI, networkUUID, nodeID) + return fmt.Sprintf("%s://%s/networks/%s/%s", protocol, runtimeConfig.IngressHost, networkUUID, nodeID) } // GetAccessibleStakingAddress retrieves a StakingAddress for the node intended to be @@ -993,6 +1029,35 @@ func (p *KubeRuntime) createNodeIngress(ctx context.Context, serviceName string) pathType = networkingv1.PathTypeImplementationSpecific ) + // Build the ingress rules + ingressRules := []networkingv1.IngressRule{ + { + IngressRuleValue: networkingv1.IngressRuleValue{ + HTTP: &networkingv1.HTTPIngressRuleValue{ + Paths: []networkingv1.HTTPIngressPath{ + { + Path: pathPattern, + PathType: &pathType, + Backend: networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{ + Name: serviceName, + Port: networkingv1.ServiceBackendPort{ + Number: config.DefaultHTTPPort, + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Add host if not localhost + if !strings.HasPrefix(runtimeConfig.IngressHost, "localhost") { + ingressRules[0].Host = runtimeConfig.IngressHost + } + ingress := &networkingv1.Ingress{ ObjectMeta: metav1.ObjectMeta{ Name: serviceName, @@ -1012,31 +1077,20 @@ func (p *KubeRuntime) createNodeIngress(ctx context.Context, serviceName string) }, Spec: networkingv1.IngressSpec{ IngressClassName: &ingressClassName, - Rules: []networkingv1.IngressRule{ - { - IngressRuleValue: networkingv1.IngressRuleValue{ - HTTP: &networkingv1.HTTPIngressRuleValue{ - Paths: []networkingv1.HTTPIngressPath{ - { - Path: pathPattern, - PathType: &pathType, - Backend: networkingv1.IngressBackend{ - Service: &networkingv1.IngressServiceBackend{ - Name: serviceName, - Port: networkingv1.ServiceBackendPort{ - Number: config.DefaultHTTPPort, - }, - }, - }, - }, - }, - }, - }, - }, - }, + Rules: ingressRules, }, } + // Add TLS configuration if IngressSecret is set + if len(runtimeConfig.IngressSecret) > 0 && !strings.HasPrefix(runtimeConfig.IngressHost, "localhost") { + ingress.Spec.TLS = []networkingv1.IngressTLS{ + { + Hosts: []string{runtimeConfig.IngressHost}, + SecretName: runtimeConfig.IngressSecret, + }, + } + } + _, err = clientset.NetworkingV1().Ingresses(namespace).Create(ctx, ingress, metav1.CreateOptions{}) if err != nil { return fmt.Errorf("failed to create Ingress: %w", err) diff --git a/tests/fixture/tmpnet/start_kind_cluster.go b/tests/fixture/tmpnet/start_kind_cluster.go index cad7e186da39..f8d4299875c7 100644 --- a/tests/fixture/tmpnet/start_kind_cluster.go +++ b/tests/fixture/tmpnet/start_kind_cluster.go @@ -112,6 +112,10 @@ func StartKindCluster( return fmt.Errorf("failed to deploy ingress controller: %w", err) } + if err := createDefaultsConfigMap(ctx, log, configPath, configContext, DefaultTmpnetNamespace); err != nil { + return fmt.Errorf("failed to create defaults ConfigMap: %w", err) + } + return nil } @@ -410,3 +414,48 @@ func runHelmCommand(ctx context.Context, args ...string) error { cmd.Stderr = os.Stderr return cmd.Run() } + +// createDefaultsConfigMap creates a ConfigMap containing defaults for the tmpnet namespace. 
+func createDefaultsConfigMap(ctx context.Context, log logging.Logger, configPath string, configContext string, namespace string) error { + clientset, err := GetClientset(log, configPath, configContext) + if err != nil { + return fmt.Errorf("failed to get clientset: %w", err) + } + + configMapName := defaultsConfigMapName + + // Check if configmap already exists + _, err = clientset.CoreV1().ConfigMaps(namespace).Get(ctx, configMapName, metav1.GetOptions{}) + if err == nil { + log.Info("defaults ConfigMap already exists", + zap.String("namespace", namespace), + zap.String("configMap", configMapName), + ) + return nil + } + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed to check for configmap %s/%s: %w", namespace, configMapName, err) + } + + log.Info("creating defaults ConfigMap", + zap.String("namespace", namespace), + zap.String("configMap", configMapName), + ) + + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: namespace, + }, + Data: map[string]string{ + ingressHostKey: "localhost:30791", + }, + } + + _, err = clientset.CoreV1().ConfigMaps(namespace).Create(ctx, configMap, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create configmap %s/%s: %w", namespace, configMapName, err) + } + + return nil +} diff --git a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml index e7f230a72515..21db7582fd0f 100644 --- a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml +++ b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml @@ -28,6 +28,9 @@ rules: - apiGroups: ["networking.k8s.io"] resources: ["ingresses"] verbs: ["get", "create"] +- apiGroups: [""] + resources: ["configmaps"] + verbs: ["get"] - apiGroups: [""] resources: ["endpoints"] verbs: ["get"] diff --git a/tests/load2/main/main.go b/tests/load2/main/main.go index 332c2a844108..55a947215ed8 100644 --- a/tests/load2/main/main.go +++ b/tests/load2/main/main.go @@ -67,7 +67,7 @@ func main() { e2e.NewTestEnvironment(tc, flagVars, network) ctx := tests.DefaultNotifyContext(0, tc.DeferCleanup) - wsURIs, err := tmpnet.GetNodeWebsocketURIs(ctx, network.Nodes, blockchainID, tc.DeferCleanup) + wsURIs, err := tmpnet.GetNodeWebsocketURIs(network.Nodes, blockchainID) require.NoError(err) registry := prometheus.NewRegistry() From 4d7e18ae10c2ecd5fee62860487ca2e0c97149d3 Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 1 Jul 2025 10:27:32 +0000 Subject: [PATCH 10/14] fixup: Cache kubeconfig --- tests/fixture/tmpnet/kube_runtime.go | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 1c2eb7f168b8..1e4b1e5394b1 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -160,6 +160,8 @@ func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logg type KubeRuntime struct { node *Node + + kubeConfig *restclient.Config } // readState reads the URI and staking address for the node if the node is running. @@ -818,13 +820,23 @@ func (p *KubeRuntime) runtimeConfig() *KubeRuntimeConfig { return p.node.getRuntimeConfig().Kube } +// getKubeconfig retrieves the kubeconfig for the target cluster. It +// will be cached after the first call to avoid unnecessary logging +// when running in-cluster. 
func (p *KubeRuntime) getKubeconfig() (*restclient.Config, error) { - runtimeConfig := p.runtimeConfig() - return GetClientConfig( - p.node.network.log, - runtimeConfig.ConfigPath, - runtimeConfig.ConfigContext, - ) + if p.kubeConfig == nil { + runtimeConfig := p.runtimeConfig() + config, err := GetClientConfig( + p.node.network.log, + runtimeConfig.ConfigPath, + runtimeConfig.ConfigContext, + ) + if err != nil { + return nil, fmt.Errorf("failed to get kubeconfig: %w", err) + } + p.kubeConfig = config + } + return p.kubeConfig, nil } func (p *KubeRuntime) getClientset() (*kubernetes.Clientset, error) { From 48be66f8af2a37817508f28d12869d561ee7aaf3 Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 8 Jul 2025 05:12:11 +0000 Subject: [PATCH 11/14] fixup: Ensure test-e2e-kube-ci runs serially --- Taskfile.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index e0a89f4612f3..21b2bf813ae7 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -203,7 +203,7 @@ tasks: env: E2E_SERIAL: 1 cmds: - - task: test-e2e-kube + - cmd: bash -x ./scripts/tests.e2e.kube.sh {{.CLI_ARGS}} # To use a different fuzz time, run `task test-fuzz FUZZTIME=[value in seconds]`. # A value of `-1` will run until it encounters a failing output. @@ -241,7 +241,7 @@ tasks: test-load2: desc: Runs second iteration of load tests - cmds: + cmds: - task: build - cmd: go run ./tests/load2/main --avalanchego-path=./build/avalanchego {{.CLI_ARGS}} From e2231cd962b0093f353abff1021649587ae428fe Mon Sep 17 00:00:00 2001 From: maru Date: Tue, 8 Jul 2025 05:47:27 +0000 Subject: [PATCH 12/14] fixup: Ensure compatibility with non-localhost ingress host --- tests/fixture/tmpnet/kube_runtime.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index 1e4b1e5394b1..ab86956291e3 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -903,6 +903,10 @@ func (p *KubeRuntime) getFlags() (FlagsMap, error) { flags[config.DataDirKey] = volumeMountPath // The node must bind to the Pod IP to enable the kubelet to access the http port for the readiness check flags[config.HTTPHostKey] = "0.0.0.0" + // Ensure compatibility with a non-localhost ingress host + if !IsRunningInCluster() && !strings.HasPrefix(p.runtimeConfig().IngressHost, "localhost") { + flags[config.HTTPAllowedHostsKey] = p.runtimeConfig().IngressHost + } return flags, nil } From 88c15bae69745cfdfacbf3441b991931a623eec9 Mon Sep 17 00:00:00 2001 From: maru Date: Thu, 3 Jul 2025 01:38:54 +0000 Subject: [PATCH 13/14] [tmpnet] Move tmpnet-specific test helpers to the tmpnet package --- tests/antithesis/config.go | 9 +- tests/e2e/e2e_test.go | 5 +- tests/fixture/e2e/helpers.go | 88 ++------------- tests/fixture/e2e/metrics_link.go | 3 +- .../flags.go => tmpnet/flags/flag_vars.go} | 11 +- .../testenv/test_environment.go} | 100 ++++++++++++++++-- tests/load/c/main/main.go | 5 +- tests/load2/main/main.go | 9 +- tests/upgrade/upgrade_test.go | 4 +- 9 files changed, 124 insertions(+), 110 deletions(-) rename tests/fixture/{e2e/flags.go => tmpnet/flags/flag_vars.go} (94%) rename tests/fixture/{e2e/env.go => tmpnet/testenv/test_environment.go} (74%) diff --git a/tests/antithesis/config.go b/tests/antithesis/config.go index e38a1e5b8ca9..d36c866935cc 100644 --- a/tests/antithesis/config.go +++ b/tests/antithesis/config.go @@ -13,8 +13,9 @@ import ( "github.com/ava-labs/avalanchego/config" "github.com/ava-labs/avalanchego/tests" - 
"github.com/ava-labs/avalanchego/tests/fixture/e2e" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/testenv" ) const ( @@ -39,7 +40,7 @@ func NewConfig(tc tests.TestContext, defaultNetwork *tmpnet.Network) *Config { func NewConfigWithSubnets(tc tests.TestContext, defaultNetwork *tmpnet.Network, getSubnets SubnetsForNodesFunc) *Config { // tmpnet configuration - flagVars := e2e.RegisterFlags() + flagVars := flags.RegisterFlags() var ( duration time.Duration @@ -85,7 +86,7 @@ func configForNewNetwork( tc tests.TestContext, defaultNetwork *tmpnet.Network, getSubnets SubnetsForNodesFunc, - flagVars *e2e.FlagVars, + flagVars *flags.FlagVars, duration time.Duration, ) *Config { if defaultNetwork.Nodes == nil { @@ -97,7 +98,7 @@ func configForNewNetwork( defaultNetwork.Subnets = getSubnets(defaultNetwork.Nodes...) } - testEnv := e2e.NewTestEnvironment(tc, flagVars, defaultNetwork) + testEnv := testenv.NewTestEnvironment(tc, flagVars, defaultNetwork) c := &Config{ Duration: duration, diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go index 88acfda1688a..7972213a92a0 100644 --- a/tests/e2e/e2e_test.go +++ b/tests/e2e/e2e_test.go @@ -24,6 +24,7 @@ import ( "github.com/ava-labs/avalanchego/tests/e2e/vms" "github.com/ava-labs/avalanchego/tests/fixture/e2e" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags" "github.com/ava-labs/avalanchego/upgrade" ) @@ -31,10 +32,10 @@ func TestE2E(t *testing.T) { ginkgo.RunSpecs(t, "e2e test suites") } -var flagVars *e2e.FlagVars +var flagVars *flags.FlagVars func init() { - flagVars = e2e.RegisterFlagsWithDefaultOwner("avalanchego-e2e") + flagVars = flags.RegisterFlagsWithDefaultOwner("avalanchego-e2e") } var _ = ginkgo.SynchronizedBeforeSuite(func() []byte { diff --git a/tests/fixture/e2e/helpers.go b/tests/fixture/e2e/helpers.go index 49d854798761..086920bba244 100644 --- a/tests/fixture/e2e/helpers.go +++ b/tests/fixture/e2e/helpers.go @@ -20,6 +20,7 @@ import ( "github.com/ava-labs/avalanchego/config" "github.com/ava-labs/avalanchego/tests" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/testenv" "github.com/ava-labs/avalanchego/utils/crypto/secp256k1" "github.com/ava-labs/avalanchego/vms/platformvm/txs/fee" "github.com/ava-labs/avalanchego/vms/secp256k1fx" @@ -49,6 +50,14 @@ const ( PrivateNetworksDirName = "private_networks" ) +var ( + // testenv helpers for creating and managing test environments + GetEnv = testenv.GetEnv + NewTestEnvironment = testenv.NewTestEnvironment + InitSharedTestEnvironment = testenv.InitSharedTestEnvironment + StartNetwork = testenv.StartNetwork +) + // NewPrivateKey returns a new private key. func NewPrivateKey(tc tests.TestContext) *secp256k1.PrivateKey { key, err := secp256k1.NewPrivateKey() @@ -270,85 +279,6 @@ func CheckBootstrapIsPossible(tc tests.TestContext, network *tmpnet.Network) *tm return node } -// Start a temporary network with the provided avalanchego binary. 
-func StartNetwork( - tc tests.TestContext, - network *tmpnet.Network, - rootNetworkDir string, - shutdownDelay time.Duration, - networkCmd NetworkCmd, -) { - require := require.New(tc) - - nodeCount := len(network.Nodes) - timeout, err := network.DefaultRuntimeConfig.GetNetworkStartTimeout(nodeCount) - require.NoError(err) - tc.Log().Info("waiting for network to start", - zap.Float64("timeoutSeconds", timeout.Seconds()), - ) - ctx := tc.ContextWithTimeout(timeout) - - err = tmpnet.BootstrapNewNetwork( - ctx, - tc.Log(), - network, - rootNetworkDir, - ) - if err != nil { - tc.DeferCleanup(func() { - tc.Log().Info("shutting down network") - ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout) - defer cancel() - require.NoError(network.Stop(ctx)) - }) - require.NoError(err, "failed to bootstrap network") - } - - tc.Log().Info("network started successfully") - - symlinkPath, err := tmpnet.GetReusableNetworkPathForOwner(network.Owner) - require.NoError(err) - - if networkCmd == ReuseNetworkCmd || networkCmd == RestartNetworkCmd { - // Symlink the path of the created network to the default owner path (e.g. latest_avalanchego-e2e) - // to enable easy discovery for reuse. - require.NoError(os.Symlink(network.Dir, symlinkPath)) - tc.Log().Info("symlinked network dir for reuse", - zap.String("networkDir", network.Dir), - zap.String("symlinkPath", symlinkPath), - ) - } - - tc.DeferCleanup(func() { - if networkCmd == ReuseNetworkCmd || networkCmd == RestartNetworkCmd { - tc.Log().Info("skipping shutdown for network intended for reuse", - zap.String("networkDir", network.Dir), - zap.String("symlinkPath", symlinkPath), - ) - return - } - - if networkCmd == StartNetworkCmd { - tc.Log().Info("skipping shutdown for --start-network", - zap.String("networkDir", network.Dir), - ) - return - } - - if shutdownDelay > 0 { - tc.Log().Info("delaying network shutdown to ensure final metrics scrape", - zap.Duration("delay", shutdownDelay), - ) - time.Sleep(shutdownDelay) - } - - tc.Log().Info("shutting down network") - ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout) - defer cancel() - require.NoError(network.Stop(ctx)) - }) -} - // NewPChainFeeCalculatorFromContext returns either a static or dynamic fee // calculator depending on the provided context. func NewPChainFeeCalculatorFromContext(context *builder.Context) fee.Calculator { diff --git a/tests/fixture/e2e/metrics_link.go b/tests/fixture/e2e/metrics_link.go index c7a164df0130..7d46b537a383 100644 --- a/tests/fixture/e2e/metrics_link.go +++ b/tests/fixture/e2e/metrics_link.go @@ -11,6 +11,7 @@ import ( "go.uber.org/zap" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/testenv" ) // The ginkgo event handlers defined in this file will be automatically @@ -32,7 +33,7 @@ var _ = ginkgo.BeforeEach(func() { // of the current spec. var _ = ginkgo.AfterEach(func() { tc := NewTestContext() - env := GetEnv(tc) + env := testenv.GetEnv(tc) // The global env isn't guaranteed to be initialized by importers // of this package since initializing a package-local env is also // supported. diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/tmpnet/flags/flag_vars.go similarity index 94% rename from tests/fixture/e2e/flags.go rename to tests/fixture/tmpnet/flags/flag_vars.go index 63dc048e8294..ec55ecc68151 100644 --- a/tests/fixture/e2e/flags.go +++ b/tests/fixture/tmpnet/flags/flag_vars.go @@ -1,7 +1,7 @@ // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. 
// See the file LICENSE for licensing terms. -package e2e +package flags import ( "errors" @@ -13,7 +13,6 @@ import ( "github.com/spf13/cast" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" - "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags" ) type NetworkCmd int @@ -28,9 +27,9 @@ const ( type FlagVars struct { startNetwork bool - startNetworkVars *flags.StartNetworkVars + startNetworkVars *StartNetworkVars - collectorVars *flags.CollectorVars + collectorVars *CollectorVars checkMetricsCollected bool checkLogsCollected bool @@ -138,9 +137,9 @@ func RegisterFlagsWithDefaultOwner(defaultOwner string) *FlagVars { "[optional] start a new network and exit without executing any tests. The new network cannot be reused with --reuse-network.", ) - vars.startNetworkVars = flags.NewStartNetworkFlagVars(defaultOwner) + vars.startNetworkVars = NewStartNetworkFlagVars(defaultOwner) - vars.collectorVars = flags.NewCollectorFlagVars() + vars.collectorVars = NewCollectorFlagVars() SetCheckCollectionFlags( &vars.checkMetricsCollected, diff --git a/tests/fixture/e2e/env.go b/tests/fixture/tmpnet/testenv/test_environment.go similarity index 74% rename from tests/fixture/e2e/env.go rename to tests/fixture/tmpnet/testenv/test_environment.go index f68ed022c1bf..2c5f89afae2c 100644 --- a/tests/fixture/e2e/env.go +++ b/tests/fixture/tmpnet/testenv/test_environment.go @@ -1,7 +1,7 @@ // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. // See the file LICENSE for licensing terms. -package e2e +package testenv import ( "context" @@ -17,6 +17,7 @@ import ( "github.com/ava-labs/avalanchego/tests" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags" "github.com/ava-labs/avalanchego/utils/crypto/secp256k1" "github.com/ava-labs/avalanchego/vms/secp256k1fx" ) @@ -77,7 +78,7 @@ func (te *TestEnvironment) Marshal() []byte { } // Initialize a new test environment with a shared network (either pre-existing or newly created). 
-func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork *tmpnet.Network) *TestEnvironment { +func NewTestEnvironment(tc tests.TestContext, flagVars *flags.FlagVars, desiredNetwork *tmpnet.Network) *TestEnvironment { require := require.New(tc) var network *tmpnet.Network @@ -86,7 +87,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork require.NoError(err) // Consider monitoring flags for any command but stop - if networkCmd != StopNetworkCmd { + if networkCmd != flags.StopNetworkCmd { if flagVars.StartMetricsCollector() { require.NoError(tmpnet.StartPrometheus(tc.DefaultContext(), tc.Log())) } @@ -102,7 +103,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork tc.Log().Warn("unable to check that metrics were collected from an uninitialized network") return } - ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout) + ctx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout) defer cancel() require.NoError(tmpnet.CheckMetricsExist(ctx, tc.Log(), network.UUID)) }) @@ -114,7 +115,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork tc.Log().Warn("unable to check that logs were collected from an uninitialized network") return } - ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout) + ctx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout) defer cancel() require.NoError(tmpnet.CheckLogsExist(ctx, tc.Log(), network.UUID)) }) @@ -122,7 +123,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } // Attempt to load the network if it may already be running - if networkCmd == StopNetworkCmd || networkCmd == ReuseNetworkCmd || networkCmd == RestartNetworkCmd { + if networkCmd == flags.StopNetworkCmd || networkCmd == flags.ReuseNetworkCmd || networkCmd == flags.RestartNetworkCmd { networkDir := flagVars.NetworkDir() var networkSymlink string // If populated, prompts removal of the referenced symlink if --stop-network is specified if len(networkDir) == 0 { @@ -148,7 +149,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork ) } - if networkCmd == StopNetworkCmd { + if networkCmd == flags.StopNetworkCmd { if len(networkSymlink) > 0 { // Remove the symlink to avoid attempts to reuse the stopped network tc.Log().Info("removing symlink", @@ -167,7 +168,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork os.Exit(0) } - if network != nil && networkCmd == RestartNetworkCmd { + if network != nil && networkCmd == flags.RestartNetworkCmd { require.NoError(network.Restart(tc.DefaultContext())) } } @@ -204,7 +205,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } } - if networkCmd == StartNetworkCmd { + if networkCmd == flags.StartNetworkCmd { os.Exit(0) } @@ -294,6 +295,85 @@ func (te *TestEnvironment) StartPrivateNetwork(network *tmpnet.Network) { network, te.RootNetworkDir, te.PrivateNetworkShutdownDelay, - EmptyNetworkCmd, + flags.EmptyNetworkCmd, ) } + +// Start a temporary network with the provided avalanchego binary. 
+func StartNetwork( + tc tests.TestContext, + network *tmpnet.Network, + rootNetworkDir string, + shutdownDelay time.Duration, + networkCmd flags.NetworkCmd, +) { + require := require.New(tc) + + nodeCount := len(network.Nodes) + timeout, err := network.DefaultRuntimeConfig.GetNetworkStartTimeout(nodeCount) + require.NoError(err) + tc.Log().Info("waiting for network to start", + zap.Float64("timeoutSeconds", timeout.Seconds()), + ) + ctx := tc.ContextWithTimeout(timeout) + + err = tmpnet.BootstrapNewNetwork( + ctx, + tc.Log(), + network, + rootNetworkDir, + ) + if err != nil { + tc.DeferCleanup(func() { + tc.Log().Info("shutting down network") + ctx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout) + defer cancel() + require.NoError(network.Stop(ctx)) + }) + require.NoError(err, "failed to bootstrap network") + } + + tc.Log().Info("network started successfully") + + symlinkPath, err := tmpnet.GetReusableNetworkPathForOwner(network.Owner) + require.NoError(err) + + if networkCmd == flags.ReuseNetworkCmd || networkCmd == flags.RestartNetworkCmd { + // Symlink the path of the created network to the default owner path (e.g. latest_avalanchego-e2e) + // to enable easy discovery for reuse. + require.NoError(os.Symlink(network.Dir, symlinkPath)) + tc.Log().Info("symlinked network dir for reuse", + zap.String("networkDir", network.Dir), + zap.String("symlinkPath", symlinkPath), + ) + } + + tc.DeferCleanup(func() { + if networkCmd == flags.ReuseNetworkCmd || networkCmd == flags.RestartNetworkCmd { + tc.Log().Info("skipping shutdown for network intended for reuse", + zap.String("networkDir", network.Dir), + zap.String("symlinkPath", symlinkPath), + ) + return + } + + if networkCmd == flags.StartNetworkCmd { + tc.Log().Info("skipping shutdown for --start-network", + zap.String("networkDir", network.Dir), + ) + return + } + + if shutdownDelay > 0 { + tc.Log().Info("delaying network shutdown to ensure final metrics scrape", + zap.Duration("delay", shutdownDelay), + ) + time.Sleep(shutdownDelay) + } + + tc.Log().Info("shutting down network") + ctx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout) + defer cancel() + require.NoError(network.Stop(ctx)) + }) +} diff --git a/tests/load/c/main/main.go b/tests/load/c/main/main.go index 914616a9aad0..210c6fff3f3c 100644 --- a/tests/load/c/main/main.go +++ b/tests/load/c/main/main.go @@ -19,6 +19,7 @@ import ( "github.com/ava-labs/avalanchego/tests" "github.com/ava-labs/avalanchego/tests/fixture/e2e" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags" "github.com/ava-labs/avalanchego/tests/load" "github.com/ava-labs/avalanchego/tests/load/c" "github.com/ava-labs/avalanchego/tests/load/c/listener" @@ -35,10 +36,10 @@ const ( logPrefix = "avalanchego-load-test" ) -var flagVars *e2e.FlagVars +var flagVars *flags.FlagVars func init() { - flagVars = e2e.RegisterFlags() + flagVars = flags.RegisterFlags() flag.Parse() } diff --git a/tests/load2/main/main.go b/tests/load2/main/main.go index 55a947215ed8..a99140d7c7b5 100644 --- a/tests/load2/main/main.go +++ b/tests/load2/main/main.go @@ -13,8 +13,9 @@ import ( "github.com/stretchr/testify/require" "github.com/ava-labs/avalanchego/tests" - "github.com/ava-labs/avalanchego/tests/fixture/e2e" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/testenv" 
"github.com/ava-labs/avalanchego/tests/load" "github.com/ava-labs/avalanchego/tests/load2" ) @@ -27,13 +28,13 @@ const ( ) var ( - flagVars *e2e.FlagVars + flagVars *flags.FlagVars loadTimeout time.Duration ) func init() { - flagVars = e2e.RegisterFlags() + flagVars = flags.RegisterFlags() flag.DurationVar( &loadTimeout, @@ -64,7 +65,7 @@ func main() { PreFundedKeys: keys, } - e2e.NewTestEnvironment(tc, flagVars, network) + testenv.NewTestEnvironment(tc, flagVars, network) ctx := tests.DefaultNotifyContext(0, tc.DeferCleanup) wsURIs, err := tmpnet.GetNodeWebsocketURIs(network.Nodes, blockchainID) diff --git a/tests/upgrade/upgrade_test.go b/tests/upgrade/upgrade_test.go index 1b35f4ed99c9..59658d8263fd 100644 --- a/tests/upgrade/upgrade_test.go +++ b/tests/upgrade/upgrade_test.go @@ -44,7 +44,7 @@ func init() { "avalanchego executable path to upgrade to", ) collectorVars = flags.NewCollectorFlagVars() - e2e.SetCheckCollectionFlags( + flags.SetCheckCollectionFlags( &checkMetricsCollected, &checkLogsCollected, ) @@ -99,7 +99,7 @@ var _ = ginkgo.Describe("[Upgrade]", func() { network, "", /* rootNetworkDir */ shutdownDelay, - e2e.EmptyNetworkCmd, + flags.EmptyNetworkCmd, ) tc.By(fmt.Sprintf("restarting all nodes with %q binary", avalancheGoExecPathToUpgradeTo)) From 32bb59c69ee4c8763814800ac13811c1931e2491 Mon Sep 17 00:00:00 2001 From: maru Date: Wed, 9 Jul 2025 03:07:32 +0000 Subject: [PATCH 14/14] [tmpnet] Revise e2e and tmpnet READMEs --- tests/e2e/README.md | 32 +-- tests/fixture/tmpnet/README.md | 452 ++++++++++++++++++++++++++++++--- 2 files changed, 420 insertions(+), 64 deletions(-) diff --git a/tests/e2e/README.md b/tests/e2e/README.md index 65bc1ddf4677..d7509884b95e 100644 --- a/tests/e2e/README.md +++ b/tests/e2e/README.md @@ -40,7 +40,7 @@ queries](https://onsi.github.io/ginkgo/#spec-labels). ## Adding tests -Define any flags/configurations in [`flags.go`](../fixture/e2e/flags.go). +Configure test behavior using the flags defined in the [tmpnet flags package](../fixture/tmpnet/flags/). For detailed flag documentation, see the [tmpnet configuration flags documentation](../fixture/tmpnet/README.md#configuration-flags). Create a new package to implement feature-specific tests, or add tests to an existing package. For example: @@ -110,30 +110,10 @@ E2E_SKIP_BOOTSTRAP_CHECKS=1 ./bin/ginkgo -v ./tests/e2e ... ## Monitoring -It is possible to enable collection of logs and metrics from the -temporary networks used for e2e testing by: +tmpnet supports comprehensive monitoring of temporary networks through log and metric collection. +This can be enabled for e2e tests in two ways: - - Supplying `--start-metrics-collector` and `--start-logs-collector` - as arguments to the test suite - - Starting collectors in advance of a test run with `tmpnetctl - start-metrics-collector` and ` tmpnetctl start-logs-collector` +- Supply `--start-metrics-collector` and `--start-logs-collector` as arguments to the test suite +- Start collectors manually before test runs using `tmpnetctl` -Both methods require: - - - Auth credentials to be supplied as env vars: - - `PROMETHEUS_USERNAME` - - `PROMETHEUS_PASSWORD` - - `LOKI_USERNAME` - - `LOKI_PASSWORD` - - The availability in the path of binaries for promtail and prometheus - - Starting a development shell with `nix develop` is one way to - ensure this and requires the installation of nix - (e.g. `./scripts/run_task.sh install-nix`). 
- -Once started, the collectors will continue to run in the background -until stopped by `tmpnetctl stop-metrics-collector` and `tmpnetctl stop-logs-collector`. - -The results of collection will be viewable at -https://grafana-poc.avax-dev.network. - -For more detail, see the [tmpnet docs](../fixture/tmpnet/README.md##monitoring). +For detailed configuration and usage, see the [tmpnet monitoring documentation](../fixture/tmpnet/README.md#monitoring). diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index 416c2c1c9f9d..f4f32e888c55 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -16,7 +16,29 @@ orchestrate the same temporary networks without the use of an rpc daemon. - [Simplifying usage with direnv](#simplifying-usage-with-direnv) - [Deprecated usage with e2e suite](#deprecated-usage-with-e2e-suite) - [Via code](#via-code) -- [Networking configuration](#networking-configuration) +- [Runtime Backends](#runtime-backends) + - [Process Runtime](#process-runtime) + - [Overview](#process-runtime-overview) + - [Requirements](#process-runtime-requirements) + - [Configuration](#process-runtime-configuration) + - [Networking](#process-runtime-networking) + - [Storage](#process-runtime-storage) + - [Monitoring](#process-runtime-monitoring) + - [Examples](#process-runtime-examples) + - [Kubernetes Runtime](#kubernetes-runtime) + - [Overview](#kubernetes-runtime-overview) + - [Requirements](#kubernetes-runtime-requirements) + - [Configuration](#kubernetes-runtime-configuration) + - [Networking](#kubernetes-runtime-networking) + - [Storage](#kubernetes-runtime-storage) + - [Monitoring](#kubernetes-runtime-monitoring) + - [Examples](#kubernetes-runtime-examples) +- [Configuration Flags](#configuration-flags) + - [Common Flags](#common-flags) + - [Process Runtime Flags](#process-runtime-flags) + - [Kubernetes Runtime Flags](#kubernetes-runtime-flags) + - [Monitoring Flags](#monitoring-flags) + - [Network Control Flags](#network-control-flags) - [Configuration on disk](#configuration-on-disk) - [Common networking configuration](#common-networking-configuration) - [Genesis](#genesis) @@ -56,35 +78,39 @@ repositories. 
The functionality in this package is grouped by logical purpose into the following non-test files: -| Filename | Types | Purpose | -|:----------------------------|:---------------|:-----------------------------------------------------------------------| -| flags/ | | Directory defining flags usable with both stdlib flags and spf13/pflag | -| flags/collector.go | | Defines flags configuring collection of logs and metrics | -| flags/common.go | | Defines type definitions common across other files | -| flags/process_runtime.go | | Defines flags configuring the process node runtime | -| flags/runtime.go | | Defines flags configuring node runtime | -| flags/start_network.go | | Defines flags configuring network start | -| tmpnetctl/ | | Directory containing main entrypoint for tmpnetctl command | -| yaml/ | | Directory defining kubernetes resources in yaml format | -| check_monitoring.go | | Enables checking if logs and metrics were collected | -| defaults.go | | Defines common default configuration | -| detached_process_default.go | | Configures detached processes for darwin and linux | -| detached_process_windows.go | | No-op detached process configuration for windows | -| flagsmap.go | FlagsMap | Simplifies configuration of avalanchego flags | -| genesis.go | | Creates test genesis | -| kube.go | | Library for Kubernetes interaction | -| local_network.go | | Defines configuration for the default local network | -| monitor_kube.go | | Enables collection of logs and metrics from kube pods | -| monitor_processes.go | | Enables collection of logs and metrics from local processes | -| network.go | Network | Orchestrates and configures temporary networks | -| network_config.go | Network | Reads and writes network configuration | -| network_test.go | | Simple test round-tripping Network serialization | -| node.go | Node | Orchestrates and configures nodes | -| node_config.go | Node | Reads and writes node configuration | -| process_runtime.go | ProcessRuntime | Orchestrates node processes | -| start_kind_cluster.go | | Starts a local kind cluster | -| subnet.go | Subnet | Orchestrates subnets | -| utils.go | | Defines shared utility functions | +| Filename | Types | Purpose | +|:----------------------------|:--------------------|:-----------------------------------------------------------------------| +| flags/ | | Directory defining flags usable with both stdlib flags and spf13/pflag | +| flags/collector.go | | Defines flags configuring collection of logs and metrics | +| flags/common.go | | Defines type definitions common across flag files | +| flags/flag_vars.go | FlagVars | Central flag management struct with validation and getters | +| flags/kube_runtime.go | | Defines flags configuring the Kubernetes node runtime | +| flags/kubeconfig.go | | Defines flags for Kubernetes cluster authentication | +| flags/process_runtime.go | | Defines flags configuring the process node runtime | +| flags/runtime.go | | Defines flags for runtime selection (process vs Kubernetes) | +| flags/start_network.go | | Defines flags configuring network start | +| tmpnetctl/ | | Directory containing main entrypoint for tmpnetctl command | +| yaml/ | | Directory defining kubernetes resources in yaml format | +| check_monitoring.go | | Enables checking if logs and metrics were collected | +| defaults.go | | Defines common default configuration | +| detached_process_default.go | | Configures detached processes for darwin and linux | +| detached_process_windows.go | | No-op detached process configuration for windows | +| 
flagsmap.go | FlagsMap | Simplifies configuration of avalanchego flags | +| genesis.go | | Creates test genesis | +| kube.go | | Library for Kubernetes interaction | +| kube_runtime.go | KubeRuntime | Orchestrates nodes running in Kubernetes | +| local_network.go | | Defines configuration for the default local network | +| monitor_kube.go | | Enables collection of logs and metrics from kube pods | +| monitor_processes.go | | Enables collection of logs and metrics from local processes | +| network.go | Network | Orchestrates and configures temporary networks | +| network_config.go | Network | Reads and writes network configuration | +| network_test.go | | Simple test round-tripping Network serialization | +| node.go | Node | Orchestrates and configures nodes | +| node_config.go | Node | Reads and writes node configuration | +| process_runtime.go | ProcessRuntime | Orchestrates nodes as local processes | +| start_kind_cluster.go | | Starts a local kind cluster for Kubernetes testing | +| subnet.go | Subnet | Orchestrates subnets | +| utils.go | | Defines shared utility functions | ## Usage @@ -188,16 +214,366 @@ uris := network.GetNodeURIs() network.Stop(context.Background()) ``` -## Networking configuration +## Runtime Backends [Top](#table-of-contents) -By default, nodes in a temporary network will be started with staking and -API ports set to `0` to ensure that ports will be dynamically -chosen. The tmpnet fixture discovers the ports used by a given node -by reading the `[base-data-dir]/process.json` file written by -avalanchego on node start. The use of dynamic ports supports testing -with many temporary networks without having to manually select compatible -port ranges. +tmpnet supports two runtime backends for running avalanchego nodes: + +- **Process Runtime**: Runs nodes as local processes on the host machine. This is the default runtime and is ideal for local development and testing. +- **Kubernetes Runtime**: Runs nodes as pods in a Kubernetes cluster. This runtime enables testing at scale and closer simulation of production environments. + +The runtime can be selected via the `--runtime` flag when using `tmpnetctl` or by configuring the appropriate runtime in code. Both runtimes support the same core functionality but differ in their deployment characteristics, resource management, and networking approaches. + +### Process Runtime +[Top](#table-of-contents) + +#### Overview {#process-runtime-overview} + +The process runtime executes avalanchego nodes as separate processes on the local machine. Each node runs in its own process with its own data directory, ports, and configuration. This runtime is the simplest to use and requires no additional infrastructure beyond the local machine. + +#### Requirements {#process-runtime-requirements} + +- **avalanchego binary**: A compiled avalanchego binary must be available locally +- **Plugin directory**: VM plugins must be available in a local directory (typically `~/.avalanchego/plugins`) +- **File system permissions**: Write access to the tmpnet root directory (default: `~/.tmpnet`) +- **Available ports**: Sufficient free ports for nodes (uses dynamic allocation by default) +- **Operating System**: Linux, macOS, or Windows (with limitations) + +#### Configuration {#process-runtime-configuration} + +Process runtime nodes can be configured through: + +1. **Command-line flags**: + ```bash + tmpnetctl start-network --avalanchego-path=/path/to/avalanchego --plugin-dir=/path/to/plugins + ``` + +2. 
**Environment variables**:
+   ```bash
+   export AVALANCHEGO_PATH=/path/to/avalanchego
+   export AVALANCHEGO_PLUGIN_DIR=/path/to/plugins
+   tmpnetctl start-network
+   ```
+
+3. **In code**:
+   ```go
+   network := &tmpnet.Network{
+       DefaultRuntimeConfig: tmpnet.NodeRuntimeConfig{
+           Process: &tmpnet.ProcessRuntimeConfig{
+               AvalanchegoPath: "/path/to/avalanchego",
+               PluginDir: "/path/to/plugins",
+               ReuseDynamicPorts: true,
+           },
+       },
+   }
+   ```
+
+Key configuration options:
+- `AvalanchegoPath`: Path to the avalanchego binary
+- `PluginDir`: Directory containing VM plugins
+- `ReuseDynamicPorts`: Whether to reuse ports when restarting nodes
+- `RedirectStdout`: Redirect node stdout to a file
+- `RedirectStderr`: Redirect node stderr to a file
+
+#### Networking {#process-runtime-networking}
+
+Process runtime nodes use local networking:
+
+- **Dynamic port allocation**: By default, nodes use port 0 for both staking and API ports, allowing the OS to assign available ports
+- **Port discovery**: Actual ports are discovered by reading the `process.json` file written by avalanchego on startup
+- **Direct connectivity**: All nodes can communicate directly via localhost
+- **No ingress required**: External access is direct to node ports
+
+#### Storage {#process-runtime-storage}
+
+Each node's data is stored in a dedicated directory:
+
+```
+~/.tmpnet/networks/[network-id]/[node-id]/
+├── chainData/    # Blockchain data
+├── db/           # Database files
+├── logs/         # Node logs
+├── plugins/      # VM binaries (if configured)
+├── config.json   # Node runtime configuration
+├── flags.json    # Node flags
+└── process.json  # Process details (PID, ports)
+```
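+
+For illustration, `process.json` contains roughly the following (the field names and values here are illustrative rather than authoritative; the file is written by avalanchego on startup and read by tmpnet to discover the dynamically assigned ports):
+
+```json
+{
+  "pid": 12345,
+  "uri": "http://127.0.0.1:42513",
+  "stakingAddress": "127.0.0.1:42514"
+}
+```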
+
+#### Monitoring {#process-runtime-monitoring}
+
+Process runtime supports log and metric collection:
+
+- **Logs**: Written to `[node-dir]/logs/` and can be collected by promtail
+- **Metrics**: Exposed on the node's API port at `/ext/metrics`
+- **File-based discovery**: Prometheus/Promtail configuration is written to `~/.tmpnet/[prometheus|promtail]/file_sd_configs/`
+
+#### Examples {#process-runtime-examples}
+
+**Basic network start**:
+```bash
+# Start a 5-node network
+tmpnetctl start-network --node-count=5 --avalanchego-path=/path/to/avalanchego
+```
+
+**Network with custom VM**:
+```bash
+# Ensure the plugin is available in the plugin directory
+cp myvm ~/.avalanchego/plugins/
+```
+
+Then configure the subnet and chain in code:
+
+```go
+network := &tmpnet.Network{
+    Subnets: []*tmpnet.Subnet{{
+        Name: "my-subnet",
+        Chains: []*tmpnet.Chain{{
+            VMName: "myvm",
+            Genesis: genesisBytes,
+        }},
+    }},
+}
+```
+
+### Kubernetes Runtime
+[Top](#table-of-contents)
+
+#### Overview {#kubernetes-runtime-overview}
+
+The Kubernetes runtime deploys avalanchego nodes as StatefulSets in a Kubernetes cluster. Each node runs in its own pod with persistent storage, service discovery, and optional ingress for external access. This runtime enables testing at scale and provides better resource isolation.
+
+#### Requirements {#kubernetes-runtime-requirements}
+
+- **Kubernetes cluster**: A running Kubernetes cluster (1.19+)
+- **kubectl access**: Configured kubeconfig with appropriate permissions
+- **Storage provisioner**: Dynamic PersistentVolume provisioner (or pre-provisioned PVs)
+- **Ingress controller** (optional): For external access (e.g., nginx-ingress)
+- **Container image**: avalanchego container image accessible to the cluster
+
+For local development, you can use:
+- **kind** (Kubernetes in Docker): `tmpnetctl start-kind-cluster`
+- **minikube**: Standard minikube setup
+- **Docker Desktop**: Built-in Kubernetes
+
+For production testing:
+- **EKS, GKE, AKS**: Cloud-managed Kubernetes
+- **Self-managed**: Any conformant Kubernetes cluster
+
+#### Configuration {#kubernetes-runtime-configuration}
+
+Kubernetes runtime nodes can be configured through:
+
+1. **Command-line flags**:
+   ```bash
+   tmpnetctl start-network \
+     --runtime=kubernetes \
+     --kube-config-path=$HOME/.kube/config \
+     --kube-namespace=avalanche-testing \
+     --kube-image=avaplatform/avalanchego:latest
+   ```
+
+2. **Environment variables**:
+   ```bash
+   export TMPNET_RUNTIME=kubernetes
+   export KUBE_CONFIG_PATH=$HOME/.kube/config
+   export KUBE_NAMESPACE=avalanche-testing
+   tmpnetctl start-network
+   ```
+
+3. **In code**:
+   ```go
+   network := &tmpnet.Network{
+       DefaultRuntimeConfig: tmpnet.NodeRuntimeConfig{
+           Kube: &tmpnet.KubeRuntimeConfig{
+               ConfigPath: os.ExpandEnv("$HOME/.kube/config"),
+               Namespace: "avalanche-testing",
+               Image: "avaplatform/avalanchego:latest",
+               VolumeSizeGB: 10,
+               UseExclusiveScheduling: true,
+               SchedulingLabelKey: "avalanche-node",
+               SchedulingLabelValue: "dedicated",
+           },
+       },
+   }
+   ```
+
+Key configuration options:
+- `ConfigPath`: Path to kubeconfig file
+- `ConfigContext`: Kubeconfig context to use
+- `Namespace`: Kubernetes namespace for resources
+- `Image`: Container image for avalanchego
+- `VolumeSizeGB`: Size of PersistentVolumeClaim (minimum 2GB)
+- `UseExclusiveScheduling`: Enable dedicated node scheduling
+- `IngressHost`: Hostname for ingress rules (sourced from the cluster if unset)
+- `IngressSecret`: TLS secret for HTTPS ingress (sourced from the cluster if unset)
+
+#### Networking {#kubernetes-runtime-networking}
+
+Kubernetes runtime networking differs based on where tmpnet is running:
+
+**When running inside the cluster**:
+- Direct pod-to-pod communication via cluster networking
+- No ingress required
+- Uses internal service discovery
+
+**When running outside the cluster**:
+- Requires ingress configuration for API access
+- Uses port forwarding for staking port access
+- Ingress paths: `/networks/[network-uuid]/[node-id]`
+
+**Ingress configuration**:
+```yaml
+# Defaults ConfigMap supplying ingress settings to the runtime
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: tmpnet-defaults
+  namespace: avalanche-testing
+data:
+  ingressHost: "tmpnet.example.com"
+  ingressSecret: "tmpnet-tls"  # Optional, for HTTPS
+```
+
+The same ConfigMap can also supply `schedulingLabelKey` and `schedulingLabelValue` defaults when exclusive scheduling is enabled.
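+
+When running outside the cluster, a node's accessible URI is composed from the ingress host and the network and node identity. A minimal sketch of the composition logic, mirroring the runtime's `GetAccessibleURI` (the helper name is illustrative):
+
+```go
+import "fmt"
+
+// accessibleURI composes an externally accessible node URI. HTTPS is
+// assumed whenever an ingress TLS secret is configured.
+func accessibleURI(ingressHost, ingressSecret, networkUUID, nodeID string) string {
+    protocol := "http"
+    if ingressSecret != "" {
+        protocol = "https"
+    }
+    return fmt.Sprintf("%s://%s/networks/%s/%s", protocol, ingressHost, networkUUID, nodeID)
+}
+```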
+
+#### Storage {#kubernetes-runtime-storage}
+
+Each node uses a PersistentVolumeClaim:
+
+- **Minimum size**: 2GB (nodes report unhealthy below 1GB free)
+- **Storage class**: Uses cluster default or can be specified
+- **Mount path**: `/data` within the container
+- **Persistence**: Data survives pod restarts
+
+Example PVC:
+```yaml
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: [network-uuid]-[node-id-prefix]-0
+spec:
+  accessModes: ["ReadWriteOnce"]
+  resources:
+    requests:
+      storage: 10Gi
+```
+
+#### Monitoring {#kubernetes-runtime-monitoring}
+
+Kubernetes runtime monitoring integration:
+
+- **Metrics**: Scraped via Prometheus ServiceMonitor or pod annotations
+- **Logs**: Collected via promtail DaemonSet
+- **Labels**: Includes standard Kubernetes labels plus tmpnet-specific labels
+- **Service discovery**: Automatic via Kubernetes APIs
+
+#### Examples {#kubernetes-runtime-examples}
+
+**Local testing with kind**:
+```bash
+# Start a kind cluster
+tmpnetctl start-kind-cluster
+
+# Start network in kind
+tmpnetctl start-network \
+  --runtime=kubernetes \
+  --kube-namespace=avalanche-testing \
+  --kube-image=avaplatform/avalanchego:latest
+```
+
+**Production-like testing with exclusive scheduling**:
+```bash
+# Label and taint dedicated nodes
+kubectl label nodes worker-1 worker-2 worker-3 avalanche-node=dedicated
+kubectl taint nodes worker-1 worker-2 worker-3 avalanche-node=dedicated:NoExecute
+
+# Start network with exclusive scheduling
+tmpnetctl start-network \
+  --runtime=kubernetes \
+  --kube-use-exclusive-scheduling \
+  --kube-scheduling-label-key=avalanche-node \
+  --kube-scheduling-label-value=dedicated
+```
+
+**External access configuration**:
+```bash
+# Create the defaults ConfigMap supplying ingress settings
+kubectl create configmap tmpnet-defaults \
+  --from-literal=ingressHost=tmpnet.example.com \
+  --from-literal=ingressSecret=tmpnet-tls
+
+# Start network (ingress defaults are sourced from the ConfigMap)
+tmpnetctl start-network --runtime=kubernetes
+```
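+
+For reference, `tmpnetctl start-kind-cluster` seeds the target namespace with a defaults ConfigMap if one does not already exist. A minimal equivalent, assuming the default `tmpnet` namespace:
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: tmpnet-defaults
+  namespace: tmpnet
+data:
+  ingressHost: "localhost:30791"
+```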
+
+## Configuration Flags
+[Top](#table-of-contents)
+
+tmpnet provides a comprehensive set of flags for configuring networks and nodes. Flags can be set via command line, environment variables, or in code.
+
+### Common Flags
+[Top](#table-of-contents)
+
+These flags apply regardless of runtime:
+
+| Flag | Environment Variable | Default | Description |
+|:-----|:--------------------|:--------|:------------|
+| `--network-dir` | `TMPNET_NETWORK_DIR` | | Path to the network directory |
+| `--root-network-dir` | `TMPNET_ROOT_NETWORK_DIR` | `~/.tmpnet/networks` | Root directory for storing networks |
+| `--network-owner` | `TMPNET_NETWORK_OWNER` | | Identifier for the network owner (for monitoring) |
+| `--node-count` | | 2 | Number of nodes to create in the network |
+| `--log-level` | | INFO | Default log level for nodes |
+
+### Process Runtime Flags
+[Top](#table-of-contents)
+
+Flags specific to the process runtime:
+
+| Flag | Environment Variable | Default | Description |
+|:-----|:--------------------|:--------|:------------|
+| `--avalanchego-path` | `AVALANCHEGO_PATH` | | Path to avalanchego binary |
+| `--plugin-dir` | `AVALANCHEGO_PLUGIN_DIR` | `~/.avalanchego/plugins` | Directory containing VM plugins |
+| `--reuse-dynamic-ports` | | false | Reuse ports when restarting nodes |
+| `--redirect-stdout` | | false | Redirect node stdout to file |
+| `--redirect-stderr` | | false | Redirect node stderr to file |
+
+### Kubernetes Runtime Flags
+[Top](#table-of-contents)
+
+Flags specific to the Kubernetes runtime:
+
+| Flag | Environment Variable | Default | Description |
+|:-----|:--------------------|:--------|:------------|
+| `--kube-config-path` | `KUBE_CONFIG_PATH` | `~/.kube/config` | Path to kubeconfig file |
+| `--kube-config-context` | `KUBE_CONFIG_CONTEXT` | | Kubeconfig context to use |
+| `--kube-namespace` | `KUBE_NAMESPACE` | `tmpnet` | Kubernetes namespace |
+| `--kube-image` | `KUBE_IMAGE` | | Container image for nodes |
+| `--kube-volume-size` | | 2 | Volume size in GB (minimum 2) |
+| `--kube-use-exclusive-scheduling` | | false | Enable exclusive node scheduling |
+| `--kube-scheduling-label-key` | | | Label key for node selection |
+| `--kube-scheduling-label-value` | | | Label value for node selection |
+
+The ingress host and TLS secret are not set via flags; they are sourced from the `tmpnet-defaults` ConfigMap in the target namespace (see [Networking](#kubernetes-runtime-networking)).
+
+### Monitoring Flags
+[Top](#table-of-contents)
+
+Flags for configuring monitoring:
+
+| Flag | Environment Variable | Default | Description |
+|:-----|:--------------------|:--------|:------------|
+| `--start-metrics-collector` | | false | Start Prometheus collector |
+| `--start-logs-collector` | | false | Start promtail collector |
+| `--stop-metrics-collector` | | false | Stop Prometheus collector |
+| `--stop-logs-collector` | | false | Stop promtail collector |
+
+### Network Control Flags
+[Top](#table-of-contents)
+
+Flags for controlling network lifecycle:
+
+| Flag | Environment Variable | Default | Description |
+|:-----|:--------------------|:--------|:------------|
+| `--start-network` | | false | Start a new network and exit |
+| `--stop-network` | | false | Stop the network |
+| `--restart-network` | | false | Restart network nodes |
+| `--reuse-network` | | false | Reuse existing network |
+
 ## Configuration on disk
 [Top](#table-of-contents)