diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e57e857af5bc..3bd3fdd615c8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -255,6 +255,7 @@ jobs: - uses: ./.github/actions/run-monitored-tmpnet-cmd with: run: ./scripts/run_task.sh test-load-kube + runtime: kube artifact_prefix: load-kube prometheus_username: ${{ secrets.PROMETHEUS_ID || '' }} prometheus_password: ${{ secrets.PROMETHEUS_PASSWORD || '' }} diff --git a/Taskfile.yml b/Taskfile.yml index e0a89f4612f3..21b2bf813ae7 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -203,7 +203,7 @@ tasks: env: E2E_SERIAL: 1 cmds: - - task: test-e2e-kube + - cmd: bash -x ./scripts/tests.e2e.kube.sh {{.CLI_ARGS}} # To use a different fuzz time, run `task test-fuzz FUZZTIME=[value in seconds]`. # A value of `-1` will run until it encounters a failing output. @@ -241,7 +241,7 @@ tasks: test-load2: desc: Runs second iteration of load tests - cmds: + cmds: - task: build - cmd: go run ./tests/load2/main --avalanchego-path=./build/avalanchego {{.CLI_ARGS}} diff --git a/flake.nix b/flake.nix index e4a2580520dd..1ee25aec4e68 100644 --- a/flake.nix +++ b/flake.nix @@ -48,7 +48,6 @@ k9s # Kubernetes TUI kind # Kubernetes-in-Docker kubernetes-helm # Helm CLI (Kubernetes package manager) - self.packages.${system}.kind-with-registry # Script installing kind configured with a local registry # Linters shellcheck @@ -64,32 +63,11 @@ # macOS-specific frameworks darwin.apple_sdk.frameworks.Security ]; - }; - }); - - # Package to install the kind-with-registry script - packages = forAllSystems ({ pkgs }: { - kind-with-registry = pkgs.stdenv.mkDerivation { - pname = "kind-with-registry"; - version = "1.0.0"; - src = pkgs.fetchurl { - url = "https://raw.githubusercontent.com/kubernetes-sigs/kind/7cb9e6be25b48a0e248097eef29d496ab1a044d0/site/static/examples/kind-with-registry.sh"; - sha256 = "0gri0x0ygcwmz8l4h6zzsvydw8rsh7qa8p5218d4hncm363i81hv"; - }; - - phases = [ "installPhase" ]; - - installPhase = '' - mkdir -p $out/bin - install -m755 $src $out/bin/kind-with-registry.sh + # Add scripts/ directory to PATH so kind-with-registry.sh is accessible + shellHook = '' + export PATH="$PWD/scripts:$PATH" ''; - - meta = with pkgs.lib; { - description = "Script to set up kind with a local registry"; - license = licenses.mit; - maintainers = with maintainers; [ "maru-ava" ]; - }; }; }); }; diff --git a/scripts/kind-with-registry.sh b/scripts/kind-with-registry.sh new file mode 100755 index 000000000000..5027432722eb --- /dev/null +++ b/scripts/kind-with-registry.sh @@ -0,0 +1,90 @@ +#!/bin/sh +# Based on https://raw.githubusercontent.com/kubernetes-sigs/kind/7cb9e6be25b48a0e248097eef29d496ab1a044d0/site/static/examples/kind-with-registry.sh +# Original work Copyright 2019 The Kubernetes Authors +# Modifications Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. +# See the file LICENSE for licensing terms. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# TODO(marun) Migrate this script to golang +set -o errexit + +# 1. 
Create registry container unless it already exists +reg_name='kind-registry' +reg_port='5001' +if [ "$(docker inspect -f '{{.State.Running}}' "${reg_name}" 2>/dev/null || true)" != 'true' ]; then + docker run \ + -d --restart=always -p "127.0.0.1:${reg_port}:5000" --network bridge --name "${reg_name}" \ + registry:2 +fi + +# 2. Create kind cluster with containerd registry config dir enabled +# TODO: kind will eventually enable this by default and this patch will +# be unnecessary. +# +# See: +# https://github.com/kubernetes-sigs/kind/issues/2875 +# https://github.com/containerd/containerd/blob/main/docs/cri/config.md#registry-configuration +# See: https://github.com/containerd/containerd/blob/main/docs/hosts.md +cat < 0 { - tc.Log().Info("delaying network shutdown to ensure final metrics scrape", - zap.Duration("delay", shutdownDelay), - ) - time.Sleep(shutdownDelay) - } - - tc.Log().Info("shutting down network") - ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout) - defer cancel() - require.NoError(network.Stop(ctx)) - }) -} - // NewPChainFeeCalculatorFromContext returns either a static or dynamic fee // calculator depending on the provided context. func NewPChainFeeCalculatorFromContext(context *builder.Context) fee.Calculator { @@ -377,26 +306,3 @@ func GetRepoRootPath(suffix string) (string, error) { } return strings.TrimSuffix(cwd, suffix), nil } - -// GetLocalURI retrieves the locally-accessible URI of the provided node. When a node -// is running as a local process, this will be the URI exposed by the node. For a -// node running remotely in kube, the URI will be a local address whose port is -// forwarded to the node's URI through the kube API server. -func GetLocalURI(tc tests.TestContext, node *tmpnet.Node) string { - uri, cancel, err := node.GetLocalURI(tc.DefaultContext()) - require.NoError(tc, err) - tc.DeferCleanup(cancel) - return uri -} - -// GetLocalStakingAddress retrieves the locally-accessible staking address of the -// provided node. When a node is a local process, this will be the staking address -// exposed by the node. For a node running remotely in kube, the staking address will -// be a local address whose port will be forwarded to the node's staking address -// through the kube API server. -func GetLocalStakingAddress(tc tests.TestContext, node *tmpnet.Node) netip.AddrPort { - stakingAddress, cancel, err := node.GetLocalStakingAddress(tc.DefaultContext()) - require.NoError(tc, err) - tc.DeferCleanup(cancel) - return stakingAddress -} diff --git a/tests/fixture/e2e/metrics_link.go b/tests/fixture/e2e/metrics_link.go index c7a164df0130..7d46b537a383 100644 --- a/tests/fixture/e2e/metrics_link.go +++ b/tests/fixture/e2e/metrics_link.go @@ -11,6 +11,7 @@ import ( "go.uber.org/zap" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/testenv" ) // The ginkgo event handlers defined in this file will be automatically @@ -32,7 +33,7 @@ var _ = ginkgo.BeforeEach(func() { // of the current spec. var _ = ginkgo.AfterEach(func() { tc := NewTestContext() - env := GetEnv(tc) + env := testenv.GetEnv(tc) // The global env isn't guaranteed to be initialized by importers // of this package since initializing a package-local env is also // supported. 
diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index 416c2c1c9f9d..f4f32e888c55 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -16,7 +16,29 @@ orchestrate the same temporary networks without the use of an rpc daemon. - [Simplifying usage with direnv](#simplifying-usage-with-direnv) - [Deprecated usage with e2e suite](#deprecated-usage-with-e2e-suite) - [Via code](#via-code) -- [Networking configuration](#networking-configuration) +- [Runtime Backends](#runtime-backends) + - [Process Runtime](#process-runtime) + - [Overview](#process-runtime-overview) + - [Requirements](#process-runtime-requirements) + - [Configuration](#process-runtime-configuration) + - [Networking](#process-runtime-networking) + - [Storage](#process-runtime-storage) + - [Monitoring](#process-runtime-monitoring) + - [Examples](#process-runtime-examples) + - [Kubernetes Runtime](#kubernetes-runtime) + - [Overview](#kubernetes-runtime-overview) + - [Requirements](#kubernetes-runtime-requirements) + - [Configuration](#kubernetes-runtime-configuration) + - [Networking](#kubernetes-runtime-networking) + - [Storage](#kubernetes-runtime-storage) + - [Monitoring](#kubernetes-runtime-monitoring) + - [Examples](#kubernetes-runtime-examples) +- [Configuration Flags](#configuration-flags) + - [Common Flags](#common-flags) + - [Process Runtime Flags](#process-runtime-flags) + - [Kubernetes Runtime Flags](#kubernetes-runtime-flags) + - [Monitoring Flags](#monitoring-flags) + - [Network Control Flags](#network-control-flags) - [Configuration on disk](#configuration-on-disk) - [Common networking configuration](#common-networking-configuration) - [Genesis](#genesis) @@ -56,35 +78,39 @@ repositories. The functionality in this package is grouped by logical purpose into the following non-test files: -| Filename | Types | Purpose | -|:----------------------------|:---------------|:-----------------------------------------------------------------------| -| flags/ | | Directory defining flags usable with both stdlib flags and spf13/pflag | -| flags/collector.go | | Defines flags configuring collection of logs and metrics | -| flags/common.go | | Defines type definitions common across other files | -| flags/process_runtime.go | | Defines flags configuring the process node runtime | -| flags/runtime.go | | Defines flags configuring node runtime | -| flags/start_network.go | | Defines flags configuring network start | -| tmpnetctl/ | | Directory containing main entrypoint for tmpnetctl command | -| yaml/ | | Directory defining kubernetes resources in yaml format | -| check_monitoring.go | | Enables checking if logs and metrics were collected | -| defaults.go | | Defines common default configuration | -| detached_process_default.go | | Configures detached processes for darwin and linux | -| detached_process_windows.go | | No-op detached process configuration for windows | -| flagsmap.go | FlagsMap | Simplifies configuration of avalanchego flags | -| genesis.go | | Creates test genesis | -| kube.go | | Library for Kubernetes interaction | -| local_network.go | | Defines configuration for the default local network | -| monitor_kube.go | | Enables collection of logs and metrics from kube pods | -| monitor_processes.go | | Enables collection of logs and metrics from local processes | -| network.go | Network | Orchestrates and configures temporary networks | -| network_config.go | Network | Reads and writes network configuration | -| network_test.go | | Simple test round-tripping Network 
serialization | -| node.go | Node | Orchestrates and configures nodes | -| node_config.go | Node | Reads and writes node configuration | -| process_runtime.go | ProcessRuntime | Orchestrates node processes | -| start_kind_cluster.go | | Starts a local kind cluster | -| subnet.go | Subnet | Orchestrates subnets | -| utils.go | | Defines shared utility functions | +| Filename | Types | Purpose | +|:----------------------------|:--------------------|:-----------------------------------------------------------------------| +| flags/ | | Directory defining flags usable with both stdlib flags and spf13/pflag | +| flags/collector.go | | Defines flags configuring collection of logs and metrics | +| flags/common.go | | Defines type definitions common across flag files | +| flags/flag_vars.go | FlagVars | Central flag management struct with validation and getters | +| flags/kube_runtime.go | | Defines flags configuring the Kubernetes node runtime | +| flags/kubeconfig.go | | Defines flags for Kubernetes cluster authentication | +| flags/process_runtime.go | | Defines flags configuring the process node runtime | +| flags/runtime.go | | Defines flags for runtime selection (process vs Kubernetes) | +| flags/start_network.go | | Defines flags configuring network start | +| tmpnetctl/ | | Directory containing main entrypoint for tmpnetctl command | +| yaml/ | | Directory defining kubernetes resources in yaml format | +| check_monitoring.go | | Enables checking if logs and metrics were collected | +| defaults.go | | Defines common default configuration | +| detached_process_default.go | | Configures detached processes for darwin and linux | +| detached_process_windows.go | | No-op detached process configuration for windows | +| flagsmap.go | FlagsMap | Simplifies configuration of avalanchego flags | +| genesis.go | | Creates test genesis | +| kube.go | | Library for Kubernetes interaction | +| kube_runtime.go | KubeRuntime | Orchestrates nodes running in Kubernetes | +| local_network.go | | Defines configuration for the default local network | +| monitor_kube.go | | Enables collection of logs and metrics from kube pods | +| monitor_processes.go | | Enables collection of logs and metrics from local processes | +| network.go | Network | Orchestrates and configures temporary networks | +| network_config.go | Network | Reads and writes network configuration | +| network_test.go | | Simple test round-tripping Network serialization | +| node.go | Node | Orchestrates and configures nodes | +| node_config.go | Node | Reads and writes node configuration | +| process_runtime.go | ProcessRuntime | Orchestrates nodes as local processes | +| start_kind_cluster.go | | Starts a local kind cluster for Kubernetes testing | +| subnet.go | Subnet | Orchestrates subnets | +| utils.go | | Defines shared utility functions | ## Usage @@ -188,16 +214,366 @@ uris := network.GetNodeURIs() network.Stop(context.Background()) ``` -## Networking configuration +## Runtime Backends [Top](#table-of-contents) -By default, nodes in a temporary network will be started with staking and -API ports set to `0` to ensure that ports will be dynamically -chosen. The tmpnet fixture discovers the ports used by a given node -by reading the `[base-data-dir]/process.json` file written by -avalanchego on node start. The use of dynamic ports supports testing -with many temporary networks without having to manually select compatible -port ranges. 
+tmpnet supports two runtime backends for running avalanchego nodes: + +- **Process Runtime**: Runs nodes as local processes on the host machine. This is the default runtime and is ideal for local development and testing. +- **Kubernetes Runtime**: Runs nodes as pods in a Kubernetes cluster. This runtime enables testing at scale and closer simulation of production environments. + +The runtime can be selected via the `--runtime` flag when using `tmpnetctl` or by configuring the appropriate runtime in code. Both runtimes support the same core functionality but differ in their deployment characteristics, resource management, and networking approaches. + +### Process Runtime +[Top](#table-of-contents) + +#### Overview {#process-runtime-overview} + +The process runtime executes avalanchego nodes as separate processes on the local machine. Each node runs in its own process with its own data directory, ports, and configuration. This runtime is the simplest to use and requires no additional infrastructure beyond the local machine. + +#### Requirements {#process-runtime-requirements} + +- **avalanchego binary**: A compiled avalanchego binary must be available locally +- **Plugin directory**: VM plugins must be available in a local directory (typically `~/.avalanchego/plugins`) +- **File system permissions**: Write access to the tmpnet root directory (default: `~/.tmpnet`) +- **Available ports**: Sufficient free ports for nodes (uses dynamic allocation by default) +- **Operating System**: Linux, macOS, or Windows (with limitations) + +#### Configuration {#process-runtime-configuration} + +Process runtime nodes can be configured through: + +1. **Command-line flags**: + ```bash + tmpnetctl start-network --avalanchego-path=/path/to/avalanchego --plugin-dir=/path/to/plugins + ``` + +2. **Environment variables**: + ```bash + export AVALANCHEGO_PATH=/path/to/avalanchego + export AVALANCHEGO_PLUGIN_DIR=/path/to/plugins + tmpnetctl start-network + ``` + +3. 
**In code**: + ```go + network := &tmpnet.Network{ + DefaultRuntimeConfig: tmpnet.NodeRuntimeConfig{ + Process: &tmpnet.ProcessRuntimeConfig{ + AvalanchegoPath: "/path/to/avalanchego", + PluginDir: "/path/to/plugins", + ReuseDynamicPorts: true, + }, + }, + } + ``` + +Key configuration options: +- `AvalanchegoPath`: Path to the avalanchego binary +- `PluginDir`: Directory containing VM plugins +- `ReuseDynamicPorts`: Whether to reuse ports when restarting nodes +- `RedirectStdout`: Redirect node stdout to a file +- `RedirectStderr`: Redirect node stderr to a file + +#### Networking {#process-runtime-networking} + +Process runtime nodes use local networking: + +- **Dynamic port allocation**: By default, nodes use port 0 for both staking and API ports, allowing the OS to assign available ports +- **Port discovery**: Actual ports are discovered by reading the `process.json` file written by avalanchego on startup +- **Direct connectivity**: All nodes can communicate directly via localhost +- **No ingress required**: External access is direct to node ports + +#### Storage {#process-runtime-storage} + +Each node's data is stored in a dedicated directory: + +``` +~/.tmpnet/networks/[network-id]/[node-id]/ +├── chainData/ # Blockchain data +├── db/ # Database files +├── logs/ # Node logs +├── plugins/ # VM binaries (if configured) +├── config.json # Node runtime configuration +├── flags.json # Node flags +└── process.json # Process details (PID, ports) +``` + +#### Monitoring {#process-runtime-monitoring} + +Process runtime supports log and metric collection: + +- **Logs**: Written to `[node-dir]/logs/` and can be collected by promtail +- **Metrics**: Exposed on the node's API port at `/ext/metrics` +- **File-based discovery**: Prometheus/Promtail configuration is written to `~/.tmpnet/[prometheus|promtail]/file_sd_configs/` + +#### Examples {#process-runtime-examples} + +**Basic network start**: +```bash +# Start a 5-node network +tmpnetctl start-network --node-count=5 --avalanchego-path=/path/to/avalanchego +``` + +**Network with custom VM**: +```bash +# Ensure plugin is available +cp myvm ~/.avalanchego/plugins/ + +# Start network (in code) +network := &tmpnet.Network{ + Subnets: []*tmpnet.Subnet{{ + Name: "my-subnet", + Chains: []*tmpnet.Chain{{ + VMName: "myvm", + Genesis: genesisBytes, + }}, + }}, +} +``` + +### Kubernetes Runtime +[Top](#table-of-contents) + +#### Overview {#kubernetes-runtime-overview} + +The Kubernetes runtime deploys avalanchego nodes as StatefulSets in a Kubernetes cluster. Each node runs in its own pod with persistent storage, service discovery, and optional ingress for external access. This runtime enables testing at scale and provides better resource isolation. 
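+
+As a minimal sketch of how test code consumes such a network (assuming a `network`
+bootstrapped as shown in [Via code](#via-code); the `node` and `ctx` names are
+placeholders), the runtime-agnostic accessors resolve endpoints that work from wherever
+the test process runs:
+
+```go
+// Works from inside or outside the cluster. Outside the cluster, an ingress-based URI
+// of the form http(s)://<ingress-host>/networks/<network-uuid>/<node-id> is returned.
+node := network.Nodes[0]
+uri := node.GetAccessibleURI()
+
+// When running outside the cluster, the staking address is forwarded through the kube
+// API server; the returned cancel function tears down the port forward.
+stakingAddress, cancel, err := node.GetAccessibleStakingAddress(ctx)
+if err != nil {
+    return err
+}
+defer cancel()
+```
+
+The process runtime implements the same accessors by returning the node's local URI and
+staking address directly, so test code does not need to branch on the runtime in use.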
+ +#### Requirements {#kubernetes-runtime-requirements} + +- **Kubernetes cluster**: A running Kubernetes cluster (1.19+) +- **kubectl access**: Configured kubeconfig with appropriate permissions +- **Storage provisioner**: Dynamic PersistentVolume provisioner (or pre-provisioned PVs) +- **Ingress controller** (optional): For external access (e.g., nginx-ingress) +- **Container image**: avalanchego container image accessible to the cluster + +For local development, you can use: +- **kind** (Kubernetes in Docker): `tmpnetctl start-kind-cluster` +- **minikube**: Standard minikube setup +- **Docker Desktop**: Built-in Kubernetes + +For production testing: +- **EKS, GKE, AKS**: Cloud-managed Kubernetes +- **Self-managed**: Any conformant Kubernetes cluster + +#### Configuration {#kubernetes-runtime-configuration} + +Kubernetes runtime configuration: + +1. **Command-line flags**: + ```bash + tmpnetctl start-network \ + --runtime=kubernetes \ + --kube-config-path=$HOME/.kube/config \ + --kube-namespace=avalanche-testing \ + --kube-image=avaplatform/avalanchego:latest + ``` + +2. **Environment variables**: + ```bash + export TMPNET_RUNTIME=kubernetes + export KUBE_CONFIG_PATH=$HOME/.kube/config + export KUBE_NAMESPACE=avalanche-testing + tmpnetctl start-network + ``` + +3. **In code**: + ```go + network := &tmpnet.Network{ + DefaultRuntimeConfig: tmpnet.NodeRuntimeConfig{ + Kube: &tmpnet.KubeRuntimeConfig{ + ConfigPath: os.ExpandEnv("$HOME/.kube/config"), + Namespace: "avalanche-testing", + Image: "avaplatform/avalanchego:latest", + VolumeSizeGB: 10, + UseExclusiveScheduling: true, + SchedulingLabelKey: "avalanche-node", + SchedulingLabelValue: "dedicated", + }, + }, + } + ``` + +Key configuration options: +- `ConfigPath`: Path to kubeconfig file +- `ConfigContext`: Kubeconfig context to use +- `Namespace`: Kubernetes namespace for resources +- `Image`: Container image for avalanchego +- `VolumeSizeGB`: Size of PersistentVolumeClaim (minimum 2GB) +- `UseExclusiveScheduling`: Enable dedicated node scheduling +- `IngressHost`: Hostname for ingress rules +- `IngressSecret`: TLS secret for HTTPS ingress + +#### Networking {#kubernetes-runtime-networking} + +Kubernetes runtime networking differs based on where tmpnet is running: + +**When running inside the cluster**: +- Direct pod-to-pod communication via cluster networking +- No ingress required +- Uses internal service discovery + +**When running outside the cluster**: +- Requires ingress configuration for API access +- Uses port forwarding for staking port access +- Ingress paths: `/networks/[network-uuid]/[node-id]` + +**Ingress configuration**: +```yaml +# Create ConfigMap for ingress settings +apiVersion: v1 +kind: ConfigMap +metadata: + name: tmpnet-ingress-config + namespace: avalanche-testing +data: + host: "tmpnet.example.com" + secret: "tmpnet-tls" # Optional, for HTTPS +``` + +#### Storage {#kubernetes-runtime-storage} + +Each node uses a PersistentVolumeClaim: + +- **Minimum size**: 2GB (nodes report unhealthy below 1GB free) +- **Storage class**: Uses cluster default or can be specified +- **Mount path**: `/data` within the container +- **Persistence**: Data survives pod restarts + +Example PVC: +```yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: [network-uuid]-[node-id-prefix]-0 +spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi +``` + +#### Monitoring {#kubernetes-runtime-monitoring} + +Kubernetes runtime monitoring integration: + +- **Metrics**: Scraped via Prometheus ServiceMonitor 
or pod annotations +- **Logs**: Collected via promtail DaemonSet +- **Labels**: Includes standard Kubernetes labels plus tmpnet-specific labels +- **Service discovery**: Automatic via Kubernetes APIs + +#### Examples {#kubernetes-runtime-examples} + +**Local testing with kind**: +```bash +# Start a kind cluster +tmpnetctl start-kind-cluster + +# Start network in kind +tmpnetctl start-network \ + --runtime=kubernetes \ + --kube-namespace=avalanche-testing \ + --kube-image=avaplatform/avalanchego:latest +``` + +**Production-like testing with exclusive scheduling**: +```bash +# Label dedicated nodes +kubectl label nodes worker-1 worker-2 worker-3 avalanche-node=dedicated +kubectl taint nodes worker-1 worker-2 worker-3 avalanche-node=dedicated:NoExecute + +# Start network with exclusive scheduling +tmpnetctl start-network \ + --runtime=kubernetes \ + --kube-use-exclusive-scheduling \ + --kube-scheduling-label-key=avalanche-node \ + --kube-scheduling-label-value=dedicated +``` + +**External access configuration**: +```bash +# Create ingress config +kubectl create configmap tmpnet-ingress-config \ + --from-literal=host=tmpnet.example.com \ + --from-literal=secret=tmpnet-tls + +# Start network (will auto-detect ingress config) +tmpnetctl start-network --runtime=kubernetes +``` + +## Configuration Flags +[Top](#table-of-contents) + +tmpnet provides a comprehensive set of flags for configuring networks and nodes. Flags can be set via command line, environment variables, or in code. + +### Common Flags +[Top](#table-of-contents) + +These flags apply regardless of runtime: + +| Flag | Environment Variable | Default | Description | +|:-----|:--------------------|:--------|:------------| +| `--network-dir` | `TMPNET_NETWORK_DIR` | | Path to the network directory | +| `--root-network-dir` | `TMPNET_ROOT_NETWORK_DIR` | `~/.tmpnet/networks` | Root directory for storing networks | +| `--network-owner` | `TMPNET_NETWORK_OWNER` | | Identifier for the network owner (for monitoring) | +| `--node-count` | | 2 | Number of nodes to create in the network | +| `--log-level` | | INFO | Default log level for nodes | + +### Process Runtime Flags +[Top](#table-of-contents) + +Flags specific to process runtime: + +| Flag | Environment Variable | Default | Description | +|:-----|:--------------------|:--------|:------------| +| `--avalanchego-path` | `AVALANCHEGO_PATH` | | Path to avalanchego binary | +| `--plugin-dir` | `AVALANCHEGO_PLUGIN_DIR` | `~/.avalanchego/plugins` | Directory containing VM plugins | +| `--reuse-dynamic-ports` | | false | Reuse ports when restarting nodes | +| `--redirect-stdout` | | false | Redirect node stdout to file | +| `--redirect-stderr` | | false | Redirect node stderr to file | + +### Kubernetes Runtime Flags +[Top](#table-of-contents) + +Flags specific to Kubernetes runtime: + +| Flag | Environment Variable | Default | Description | +|:-----|:--------------------|:--------|:------------| +| `--kube-config-path` | `KUBE_CONFIG_PATH` | `~/.kube/config` | Path to kubeconfig file | +| `--kube-config-context` | `KUBE_CONFIG_CONTEXT` | | Kubeconfig context to use | +| `--kube-namespace` | `KUBE_NAMESPACE` | `tmpnet` | Kubernetes namespace | +| `--kube-image` | `KUBE_IMAGE` | | Container image for nodes | +| `--kube-volume-size` | | 2 | Volume size in GB (minimum 2) | +| `--kube-use-exclusive-scheduling` | | false | Enable exclusive node scheduling | +| `--kube-scheduling-label-key` | | | Label key for node selection | +| `--kube-scheduling-label-value` | | | Label value for node selection | +| 
`--kube-ingress-host` | | | Hostname for ingress rules | +| `--kube-ingress-secret` | | | TLS secret for HTTPS ingress | + +### Monitoring Flags +[Top](#table-of-contents) + +Flags for configuring monitoring: + +| Flag | Environment Variable | Default | Description | +|:-----|:--------------------|:--------|:------------| +| `--start-metrics-collector` | | false | Start prometheus collector | +| `--start-logs-collector` | | false | Start promtail collector | +| `--stop-metrics-collector` | | false | Stop prometheus collector | +| `--stop-logs-collector` | | false | Stop promtail collector | + +### Network Control Flags +[Top](#table-of-contents) + +Flags for controlling network lifecycle: + +| Flag | Environment Variable | Default | Description | +|:-----|:--------------------|:--------|:------------| +| `--start-network` | | false | Start a new network | +| `--stop-network` | | false | Stop the network | +| `--restart-network` | | false | Restart network nodes | +| `--reuse-network` | | false | Reuse existing network | + ## Configuration on disk [Top](#table-of-contents) diff --git a/tests/fixture/e2e/flags.go b/tests/fixture/tmpnet/flags/flag_vars.go similarity index 94% rename from tests/fixture/e2e/flags.go rename to tests/fixture/tmpnet/flags/flag_vars.go index 63dc048e8294..ec55ecc68151 100644 --- a/tests/fixture/e2e/flags.go +++ b/tests/fixture/tmpnet/flags/flag_vars.go @@ -1,7 +1,7 @@ // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. // See the file LICENSE for licensing terms. -package e2e +package flags import ( "errors" @@ -13,7 +13,6 @@ import ( "github.com/spf13/cast" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" - "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags" ) type NetworkCmd int @@ -28,9 +27,9 @@ const ( type FlagVars struct { startNetwork bool - startNetworkVars *flags.StartNetworkVars + startNetworkVars *StartNetworkVars - collectorVars *flags.CollectorVars + collectorVars *CollectorVars checkMetricsCollected bool checkLogsCollected bool @@ -138,9 +137,9 @@ func RegisterFlagsWithDefaultOwner(defaultOwner string) *FlagVars { "[optional] start a new network and exit without executing any tests. 
The new network cannot be reused with --reuse-network.", ) - vars.startNetworkVars = flags.NewStartNetworkFlagVars(defaultOwner) + vars.startNetworkVars = NewStartNetworkFlagVars(defaultOwner) - vars.collectorVars = flags.NewCollectorFlagVars() + vars.collectorVars = NewCollectorFlagVars() SetCheckCollectionFlags( &vars.checkMetricsCollected, diff --git a/tests/fixture/tmpnet/flags/kube_runtime.go b/tests/fixture/tmpnet/flags/kube_runtime.go index cc5304cb7952..3eee81ea99d9 100644 --- a/tests/fixture/tmpnet/flags/kube_runtime.go +++ b/tests/fixture/tmpnet/flags/kube_runtime.go @@ -20,10 +20,9 @@ const ( ) var ( - errKubeNamespaceRequired = errors.New("--kube-namespace is required") - errKubeImageRequired = errors.New("--kube-image is required") - errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) - errKubeSchedulingLabelRequired = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when --kube-use-exclusive-scheduling is enabled") + errKubeNamespaceRequired = errors.New("--kube-namespace is required") + errKubeImageRequired = errors.New("--kube-image is required") + errKubeMinVolumeSizeRequired = fmt.Errorf("--kube-volume-size must be >= %d", tmpnet.MinimumVolumeSizeGB) ) type kubeRuntimeVars struct { @@ -77,13 +76,13 @@ func (v *kubeRuntimeVars) register(stringVar varFunc[string], uintVar varFunc[ui stringVar( &v.schedulingLabelKey, "kube-scheduling-label-key", - "purpose", + "", kubeDocPrefix+"The label key to use for exclusive scheduling for node selection and toleration", ) stringVar( &v.schedulingLabelValue, "kube-scheduling-label-value", - "higher-spec", + "", kubeDocPrefix+"The label value to use for exclusive scheduling for node selection and toleration", ) } @@ -98,9 +97,6 @@ func (v *kubeRuntimeVars) getKubeRuntimeConfig() (*tmpnet.KubeRuntimeConfig, err if v.volumeSizeGB < tmpnet.MinimumVolumeSizeGB { return nil, errKubeMinVolumeSizeRequired } - if v.useExclusiveScheduling && (len(v.schedulingLabelKey) == 0 || len(v.schedulingLabelValue) == 0) { - return nil, errKubeSchedulingLabelRequired - } return &tmpnet.KubeRuntimeConfig{ ConfigPath: v.config.Path, ConfigContext: v.config.Context, diff --git a/tests/fixture/tmpnet/kube_runtime.go b/tests/fixture/tmpnet/kube_runtime.go index c3bb4e72e4ab..ab86956291e3 100644 --- a/tests/fixture/tmpnet/kube_runtime.go +++ b/tests/fixture/tmpnet/kube_runtime.go @@ -16,13 +16,16 @@ import ( "go.uber.org/zap" "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "github.com/ava-labs/avalanchego/config" "github.com/ava-labs/avalanchego/ids" + "github.com/ava-labs/avalanchego/utils/logging" corev1 "k8s.io/api/core/v1" + networkingv1 "k8s.io/api/networking/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" restclient "k8s.io/client-go/rest" @@ -49,6 +52,15 @@ const ( // are never scheduled to the same nodes. antiAffinityLabelKey = "tmpnet-scheduling" antiAffinityLabelValue = "exclusive" + + // Name of config map containing tmpnet defaults + defaultsConfigMapName = "tmpnet-defaults" + ingressHostKey = "ingressHost" +) + +var ( + errMissingSchedulingLabels = errors.New("--kube-scheduling-label-key and --kube-scheduling-label-value are required when exclusive scheduling is enabled") + errMissingIngressHost = errors.New("IngressHost is a required value. 
Ensure the " + defaultsConfigMapName + " ConfigMap contains an entry for " + ingressHostKey) ) type KubeRuntimeConfig struct { @@ -70,10 +82,86 @@ type KubeRuntimeConfig struct { SchedulingLabelKey string `json:"schedulingLabelKey,omitempty"` // Label value to use for exclusive scheduling for node selection and toleration SchedulingLabelValue string `json:"schedulingLabelValue,omitempty"` + // Host for ingress rules (e.g., "localhost:30791" for kind, "tmpnet.example.com" for EKS) + IngressHost string `json:"ingressHost,omitempty"` + // TLS secret name for ingress (empty for HTTP, populated for HTTPS) + IngressSecret string `json:"ingressSecret,omitempty"` +} + +// ensureDefaults sets cluster-specific defaults for fields not already set by flags. +func (c *KubeRuntimeConfig) ensureDefaults(ctx context.Context, log logging.Logger) error { + // Only read defaults if necessary + requireSchedulingDefaults := c.UseExclusiveScheduling && (len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0) + requireIngressDefaults := !IsRunningInCluster() && len(c.IngressHost) == 0 + if !requireSchedulingDefaults && !requireIngressDefaults { + return nil + } + + clientset, err := GetClientset(log, c.ConfigPath, c.ConfigContext) + if err != nil { + return err + } + + log.Info("attempting to retrieve configmap containing tmpnet defaults", + zap.String("namespace", c.Namespace), + zap.String("configMap", defaultsConfigMapName), + ) + + configMap, err := clientset.CoreV1().ConfigMaps(c.Namespace).Get(ctx, defaultsConfigMapName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get ConfigMap: %w", err) + } + + if requireSchedulingDefaults { + var ( + schedulingLabelKey = configMap.Data["schedulingLabelKey"] + schedulingLabelValue = configMap.Data["schedulingLabelValue"] + ) + if len(c.SchedulingLabelKey) == 0 && len(schedulingLabelKey) > 0 { + log.Info("setting default value for SchedulingLabelKey", + zap.String("schedulingLabelKey", schedulingLabelKey), + ) + c.SchedulingLabelKey = schedulingLabelKey + } + if len(c.SchedulingLabelValue) == 0 && len(schedulingLabelValue) > 0 { + log.Info("setting default value for SchedulingLabelValue", + zap.String("schedulingLabelValue", schedulingLabelValue), + ) + c.SchedulingLabelValue = schedulingLabelValue + } + if len(c.SchedulingLabelKey) == 0 || len(c.SchedulingLabelValue) == 0 { + return errMissingSchedulingLabels + } + } + if requireIngressDefaults { + var ( + ingressHost = configMap.Data[ingressHostKey] + ingressSecret = configMap.Data["ingressSecret"] + ) + if len(c.IngressHost) == 0 && len(ingressHost) > 0 { + log.Info("setting default value for IngressHost", + zap.String("ingressHost", ingressHost), + ) + c.IngressHost = ingressHost + } + if len(c.IngressSecret) == 0 && len(ingressSecret) > 0 { + log.Info("setting default value for IngressSecret", + zap.String("ingressSecret", ingressSecret), + ) + c.IngressSecret = ingressSecret + } + if len(c.IngressHost) == 0 { + return errMissingIngressHost + } + } + + return nil } type KubeRuntime struct { node *Node + + kubeConfig *restclient.Config } // readState reads the URI and staking address for the node if the node is running. 
@@ -92,6 +180,11 @@ func (p *KubeRuntime) readState(ctx context.Context) error { zap.String("statefulSet", statefulSetName), ) + // Validate that it will be possible to construct accessible URIs when running external to the kube cluster + if !IsRunningInCluster() && len(runtimeConfig.IngressHost) == 0 { + return errors.New("IngressHost must be set when running outside of the kubernetes cluster") + } + clientset, err := p.getClientset() if err != nil { return err @@ -134,31 +227,34 @@ func (p *KubeRuntime) readState(ctx context.Context) error { return nil } -// GetLocalURI retrieves a URI for the node intended to be accessible from this -// process until the provided cancel function is called. -func (p *KubeRuntime) GetLocalURI(ctx context.Context) (string, func(), error) { - if len(p.node.URI) == 0 { - // Assume that an empty URI indicates a need to read pod state - if err := p.readState(ctx); err != nil { - return "", func() {}, fmt.Errorf("failed to read Pod state: %w", err) - } - } - - // Use direct pod URI if running inside the cluster +// GetAccessibleURI retrieves a URI for the node accessible from where +// this process is running. If the process is running inside a kube +// cluster, the node and the process will be assumed to be running in the +// same kube cluster and the node's URI be used. If the process is +// running outside of a kube cluster, a URI accessible from outside of +// the cluster will be used. +func (p *KubeRuntime) GetAccessibleURI() string { if IsRunningInCluster() { - return p.node.URI, func() {}, nil + return p.node.URI } - port, stopChan, err := p.forwardPort(ctx, config.DefaultHTTPPort) - if err != nil { - return "", nil, err + var ( + protocol = "http" + nodeID = p.node.NodeID.String() + networkUUID = p.node.network.UUID + runtimeConfig = p.runtimeConfig() + ) + // Assume tls is configured for an ingress secret + if len(runtimeConfig.IngressSecret) > 0 { + protocol = "https" } - return fmt.Sprintf("http://127.0.0.1:%d", port), func() { close(stopChan) }, nil + + return fmt.Sprintf("%s://%s/networks/%s/%s", protocol, runtimeConfig.IngressHost, networkUUID, nodeID) } -// GetLocalStakingAddress retrieves a StakingAddress for the node intended to be +// GetAccessibleStakingAddress retrieves a StakingAddress for the node intended to be // accessible from this process until the provided cancel function is called. 
-func (p *KubeRuntime) GetLocalStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { +func (p *KubeRuntime) GetAccessibleStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { if p.node.StakingAddress == (netip.AddrPort{}) { // Assume that an empty staking address indicates a need to retrieve pod state if err := p.readState(ctx); err != nil { @@ -311,6 +407,23 @@ func (p *KubeRuntime) Start(ctx context.Context) error { zap.String("statefulSet", statefulSetName), ) + if !IsRunningInCluster() { + // If running outside the cluster, ensure the node's API port is accessible via ingress + + serviceName := "s-" + statefulSetName // The 's-' prefix ensures DNS compatibility + if err := p.createNodeService(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Service for node: %w", err) + } + + if err := p.createNodeIngress(ctx, serviceName); err != nil { + return fmt.Errorf("failed to create Ingress for node: %w", err) + } + + if err := p.waitForIngressReadiness(ctx, serviceName); err != nil { + return fmt.Errorf("failed to wait for Ingress readiness: %w", err) + } + } + return p.ensureBootstrapIP(ctx) } @@ -571,9 +684,6 @@ func (p *KubeRuntime) Restart(ctx context.Context) error { } // IsHealthy checks if the node is running and healthy. -// -// TODO(marun) Add WaitForHealthy as a runtime method to minimize API calls required and -// enable reuse of forwarded connection when running external to the kubernetes cluster func (p *KubeRuntime) IsHealthy(ctx context.Context) (bool, error) { err := p.readState(ctx) if err != nil { @@ -583,13 +693,7 @@ func (p *KubeRuntime) IsHealthy(ctx context.Context) (bool, error) { return false, errNotRunning } - uri, cancel, err := p.GetLocalURI(ctx) - if err != nil { - return false, err - } - defer cancel() - - healthReply, err := CheckNodeHealth(ctx, uri) + healthReply, err := CheckNodeHealth(ctx, p.GetAccessibleURI()) if errors.Is(err, ErrUnrecoverableNodeHealthCheck) { return false, err } else if err != nil { @@ -716,13 +820,23 @@ func (p *KubeRuntime) runtimeConfig() *KubeRuntimeConfig { return p.node.getRuntimeConfig().Kube } +// getKubeconfig retrieves the kubeconfig for the target cluster. It +// will be cached after the first call to avoid unnecessary logging +// when running in-cluster. 
func (p *KubeRuntime) getKubeconfig() (*restclient.Config, error) { - runtimeConfig := p.runtimeConfig() - return GetClientConfig( - p.node.network.log, - runtimeConfig.ConfigPath, - runtimeConfig.ConfigContext, - ) + if p.kubeConfig == nil { + runtimeConfig := p.runtimeConfig() + config, err := GetClientConfig( + p.node.network.log, + runtimeConfig.ConfigPath, + runtimeConfig.ConfigContext, + ) + if err != nil { + return nil, fmt.Errorf("failed to get kubeconfig: %w", err) + } + p.kubeConfig = config + } + return p.kubeConfig, nil } func (p *KubeRuntime) getClientset() (*kubernetes.Clientset, error) { @@ -789,6 +903,10 @@ func (p *KubeRuntime) getFlags() (FlagsMap, error) { flags[config.DataDirKey] = volumeMountPath // The node must bind to the Pod IP to enable the kubelet to access the http port for the readiness check flags[config.HTTPHostKey] = "0.0.0.0" + // Ensure compatibility with a non-localhost ingress host + if !IsRunningInCluster() && !strings.HasPrefix(p.runtimeConfig().IngressHost, "localhost") { + flags[config.HTTPAllowedHostsKey] = p.runtimeConfig().IngressHost + } return flags, nil } @@ -837,6 +955,310 @@ func configureExclusiveScheduling(template *corev1.PodTemplateSpec, labelKey str } } +// createNodeService creates a Kubernetes Service for the node to enable ingress routing +func (p *KubeRuntime) createNodeService(ctx context.Context, serviceName string) error { + var ( + log = p.node.network.log + nodeID = p.node.NodeID.String() + runtimeConfig = p.runtimeConfig() + namespace = runtimeConfig.Namespace + ) + + log.Debug("creating Service for node", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + + clientset, err := p.getClientset() + if err != nil { + return err + } + + service := &corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceName, + Namespace: namespace, + Labels: map[string]string{ + "app": serviceName, + "network-uuid": p.node.network.UUID, + "node-id": nodeID, + }, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{ + "network_uuid": p.node.network.UUID, + "node_id": nodeID, + }, + Ports: []corev1.ServicePort{ + { + Name: "http", + Port: config.DefaultHTTPPort, + TargetPort: intstr.FromInt(config.DefaultHTTPPort), + Protocol: corev1.ProtocolTCP, + }, + }, + Type: corev1.ServiceTypeClusterIP, + }, + } + + _, err = clientset.CoreV1().Services(namespace).Create(ctx, service, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Service: %w", err) + } + + log.Debug("created Service", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + + return nil +} + +// createNodeIngress creates a Kubernetes Ingress for the node to enable external access +func (p *KubeRuntime) createNodeIngress(ctx context.Context, serviceName string) error { + var ( + log = p.node.network.log + nodeID = p.node.NodeID.String() + runtimeConfig = p.runtimeConfig() + namespace = runtimeConfig.Namespace + networkUUID = p.node.network.UUID + ) + + log.Debug("creating Ingress for node", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + + clientset, err := p.getClientset() + if err != nil { + return err + } + + var ( + ingressClassName = "nginx" // Assume nginx ingress controller + // Path pattern: /networks//(/|$)(.*) + // Using (/|$)(.*) to properly handle trailing slashes + pathPattern = fmt.Sprintf("/networks/%s/%s", networkUUID, nodeID) + "(/|$)(.*)" + 
pathType = networkingv1.PathTypeImplementationSpecific + ) + + // Build the ingress rules + ingressRules := []networkingv1.IngressRule{ + { + IngressRuleValue: networkingv1.IngressRuleValue{ + HTTP: &networkingv1.HTTPIngressRuleValue{ + Paths: []networkingv1.HTTPIngressPath{ + { + Path: pathPattern, + PathType: &pathType, + Backend: networkingv1.IngressBackend{ + Service: &networkingv1.IngressServiceBackend{ + Name: serviceName, + Port: networkingv1.ServiceBackendPort{ + Number: config.DefaultHTTPPort, + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Add host if not localhost + if !strings.HasPrefix(runtimeConfig.IngressHost, "localhost") { + ingressRules[0].Host = runtimeConfig.IngressHost + } + + ingress := &networkingv1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: serviceName, + Namespace: namespace, + Labels: map[string]string{ + "app": serviceName, + "network-uuid": networkUUID, + "node-id": nodeID, + }, + Annotations: map[string]string{ + "nginx.ingress.kubernetes.io/use-regex": "true", + "nginx.ingress.kubernetes.io/rewrite-target": "/$2", + "nginx.ingress.kubernetes.io/proxy-body-size": "0", + "nginx.ingress.kubernetes.io/proxy-read-timeout": "600", + "nginx.ingress.kubernetes.io/proxy-send-timeout": "600", + }, + }, + Spec: networkingv1.IngressSpec{ + IngressClassName: &ingressClassName, + Rules: ingressRules, + }, + } + + // Add TLS configuration if IngressSecret is set + if len(runtimeConfig.IngressSecret) > 0 && !strings.HasPrefix(runtimeConfig.IngressHost, "localhost") { + ingress.Spec.TLS = []networkingv1.IngressTLS{ + { + Hosts: []string{runtimeConfig.IngressHost}, + SecretName: runtimeConfig.IngressSecret, + }, + } + } + + _, err = clientset.NetworkingV1().Ingresses(namespace).Create(ctx, ingress, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create Ingress: %w", err) + } + + log.Debug("created Ingress", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + zap.String("path", pathPattern), + ) + + return nil +} + +// waitForIngressReadiness waits for the ingress to be ready and able to route traffic +// This prevents 503 errors when health checks are performed immediately after node start +func (p *KubeRuntime) waitForIngressReadiness(ctx context.Context, serviceName string) error { + var ( + log = p.node.network.log + nodeID = p.node.NodeID.String() + runtimeConfig = p.runtimeConfig() + namespace = runtimeConfig.Namespace + ) + + log.Debug("waiting for Ingress readiness", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + + clientset, err := p.getClientset() + if err != nil { + return err + } + + // Wait for the ingress to exist, be processed by the controller, and service endpoints to be available + err = wait.PollUntilContextCancel( + ctx, + statusCheckInterval, + true, // immediate + func(ctx context.Context) (bool, error) { + // Check if ingress exists and is processed by the controller + ingress, err := clientset.NetworkingV1().Ingresses(namespace).Get(ctx, serviceName, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + log.Verbo("waiting for Ingress to be created", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return false, nil + } + if err != nil { + log.Warn("failed to retrieve Ingress", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + zap.Error(err), + ) + return false, nil + } + + 
// Check if ingress controller has processed the ingress + // The ingress controller should populate the Status.LoadBalancer.Ingress field + // when it has successfully processed and exposed the ingress + hasIngressIP := len(ingress.Status.LoadBalancer.Ingress) > 0 + if !hasIngressIP { + log.Verbo("waiting for Ingress controller to process and expose the Ingress", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return false, nil + } + + // Validate that at least one ingress has an IP or hostname + hasValidIngress := false + for _, ing := range ingress.Status.LoadBalancer.Ingress { + if ing.IP != "" || ing.Hostname != "" { + hasValidIngress = true + break + } + } + + if !hasValidIngress { + log.Verbo("waiting for Ingress controller to assign IP or hostname", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return false, nil + } + + // Check if service endpoints are available + endpoints, err := clientset.CoreV1().Endpoints(namespace).Get(ctx, serviceName, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + log.Verbo("waiting for Service endpoints to be created", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + return false, nil + } + if err != nil { + log.Warn("failed to retrieve Service endpoints", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + zap.Error(err), + ) + return false, nil + } + + // Check if endpoints have at least one ready address + hasReadyEndpoints := false + for _, subset := range endpoints.Subsets { + if len(subset.Addresses) > 0 { + hasReadyEndpoints = true + break + } + } + + if !hasReadyEndpoints { + log.Verbo("waiting for Service endpoints to have ready addresses", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("service", serviceName), + ) + return false, nil + } + + log.Debug("Ingress is exposed by controller and Service endpoints are ready", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + return true, nil + }, + ) + if err != nil { + return fmt.Errorf("failed to wait for Ingress %s/%s readiness: %w", namespace, serviceName, err) + } + + log.Debug("Ingress is ready", + zap.String("nodeID", nodeID), + zap.String("namespace", namespace), + zap.String("ingress", serviceName), + ) + + return nil +} + // IsRunningInCluster detects if this code is running inside a Kubernetes cluster // by checking for the presence of the service account token that's automatically // mounted in every pod. diff --git a/tests/fixture/tmpnet/monitor_kube.go b/tests/fixture/tmpnet/monitor_kube.go index 1bd5cc11cbf2..9128e55c80c9 100644 --- a/tests/fixture/tmpnet/monitor_kube.go +++ b/tests/fixture/tmpnet/monitor_kube.go @@ -37,7 +37,7 @@ type kubeCollectorConfig struct { } // DeployKubeCollectors deploys collectors of logs and metrics to a Kubernetes cluster. 
-func DeployKubeCollectors( +func deployKubeCollectors( ctx context.Context, log logging.Logger, configPath string, diff --git a/tests/fixture/tmpnet/network.go b/tests/fixture/tmpnet/network.go index 25c0f6c42b3c..d167db12c48a 100644 --- a/tests/fixture/tmpnet/network.go +++ b/tests/fixture/tmpnet/network.go @@ -177,7 +177,7 @@ func BootstrapNewNetwork( if err := checkVMBinaries(log, network.Subnets, network.DefaultRuntimeConfig.Process); err != nil { return err } - if err := network.EnsureDefaultConfig(log); err != nil { + if err := network.EnsureDefaultConfig(ctx, log); err != nil { return err } if err := network.Create(rootNetworkDir); err != nil { @@ -234,7 +234,14 @@ func ReadNetwork(ctx context.Context, log logging.Logger, dir string) (*Network, } // Initializes a new network with default configuration. -func (n *Network) EnsureDefaultConfig(log logging.Logger) error { +func (n *Network) EnsureDefaultConfig(ctx context.Context, log logging.Logger) error { + // Populate runtime defaults before logging it + if n.DefaultRuntimeConfig.Kube != nil { + if err := n.DefaultRuntimeConfig.Kube.ensureDefaults(ctx, log); err != nil { + return err + } + } + log.Info("preparing configuration for new network", zap.Any("runtimeConfig", n.DefaultRuntimeConfig), ) @@ -442,11 +449,7 @@ func (n *Network) Bootstrap(ctx context.Context, log logging.Logger) error { } // Don't restart the node during subnet creation since it will always be restarted afterwards. - uri, cancel, err := bootstrapNode.GetLocalURI(ctx) - if err != nil { - return err - } - defer cancel() + uri := bootstrapNode.GetAccessibleURI() if err := n.CreateSubnets(ctx, log, uri, false /* restartRequired */); err != nil { return err } @@ -777,11 +780,9 @@ func (n *Network) GetNode(nodeID ids.NodeID) (*Node, error) { return nil, fmt.Errorf("%s is not known to the network", nodeID) } -// GetNodeURIs returns the URIs of nodes in the network that are running and not ephemeral. The URIs -// returned are guaranteed be reachable by the caller until the cleanup function is called regardless -// of whether the nodes are running as local processes or in a kube cluster. -func (n *Network) GetNodeURIs(ctx context.Context, deferCleanupFunc func(func())) ([]NodeURI, error) { - return GetNodeURIs(ctx, n.Nodes, deferCleanupFunc) +// GetNodeURIs returns the accessible URIs of nodes in the network that are running and not ephemeral. +func (n *Network) GetNodeURIs() []NodeURI { + return GetNodeURIs(n.Nodes) } // GetAvailableNodeIDs returns the node IDs of nodes in the network that are running and not ephemeral. 
@@ -954,7 +955,7 @@ func waitForHealthy(ctx context.Context, log logging.Logger, nodes []*Node) erro unhealthyNodes.Remove(node) log.Info("node is healthy", zap.Stringer("nodeID", node.NodeID), - zap.String("uri", node.URI), + zap.String("uri", node.GetAccessibleURI()), ) } diff --git a/tests/fixture/tmpnet/network_test.go b/tests/fixture/tmpnet/network_test.go index fa789d7e9935..c632b41186b0 100644 --- a/tests/fixture/tmpnet/network_test.go +++ b/tests/fixture/tmpnet/network_test.go @@ -26,7 +26,7 @@ func TestNetworkSerialization(t *testing.T) { network.PrimarySubnetConfig = ConfigMap{ "validatorOnly": true, } - require.NoError(network.EnsureDefaultConfig(logging.NoLog{})) + require.NoError(network.EnsureDefaultConfig(ctx, logging.NoLog{})) require.NoError(network.Create(tmpDir)) // Ensure node runtime is initialized require.NoError(network.readNodes(ctx)) diff --git a/tests/fixture/tmpnet/node.go b/tests/fixture/tmpnet/node.go index a76eb02a7e44..347253356b68 100644 --- a/tests/fixture/tmpnet/node.go +++ b/tests/fixture/tmpnet/node.go @@ -41,8 +41,8 @@ var ( // NodeRuntime defines the methods required to support running a node. type NodeRuntime interface { readState(ctx context.Context) error - GetLocalURI(ctx context.Context) (string, func(), error) - GetLocalStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) + GetAccessibleURI() string + GetAccessibleStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) Start(ctx context.Context) error InitiateStop(ctx context.Context) error WaitForStopped(ctx context.Context) error @@ -199,12 +199,12 @@ func (n *Node) readState(ctx context.Context) error { return n.getRuntime().readState(ctx) } -func (n *Node) GetLocalURI(ctx context.Context) (string, func(), error) { - return n.getRuntime().GetLocalURI(ctx) +func (n *Node) GetAccessibleURI() string { + return n.getRuntime().GetAccessibleURI() } -func (n *Node) GetLocalStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { - return n.getRuntime().GetLocalStakingAddress(ctx) +func (n *Node) GetAccessibleStakingAddress(ctx context.Context) (netip.AddrPort, func(), error) { + return n.getRuntime().GetAccessibleStakingAddress(ctx) } // Writes the current state of the metrics endpoint to disk @@ -213,12 +213,7 @@ func (n *Node) SaveMetricsSnapshot(ctx context.Context) error { // No URI to request metrics from return nil } - baseURI, cancel, err := n.GetLocalURI(ctx) - if err != nil { - return nil - } - defer cancel() - uri := baseURI + "/ext/metrics" + uri := n.GetAccessibleURI() + "/ext/metrics" req, err := http.NewRequestWithContext(ctx, http.MethodGet, uri, nil) if err != nil { return err diff --git a/tests/fixture/tmpnet/process_runtime.go b/tests/fixture/tmpnet/process_runtime.go index 9088f0928210..94d7154c3d59 100644 --- a/tests/fixture/tmpnet/process_runtime.go +++ b/tests/fixture/tmpnet/process_runtime.go @@ -417,11 +417,12 @@ func (p *ProcessRuntime) writeMonitoringConfigFile(name string, config []ConfigM return nil } -func (p *ProcessRuntime) GetLocalURI(_ context.Context) (string, func(), error) { - return p.node.URI, func() {}, nil +// GetAccessibleURI returns the URI that can be used to access the node's API. 
+func (p *ProcessRuntime) GetAccessibleURI() string { + return p.node.URI } -func (p *ProcessRuntime) GetLocalStakingAddress(_ context.Context) (netip.AddrPort, func(), error) { +func (p *ProcessRuntime) GetAccessibleStakingAddress(_ context.Context) (netip.AddrPort, func(), error) { return p.node.StakingAddress, func() {}, nil } diff --git a/tests/fixture/tmpnet/start_kind_cluster.go b/tests/fixture/tmpnet/start_kind_cluster.go index 1957005c4d21..f8d4299875c7 100644 --- a/tests/fixture/tmpnet/start_kind_cluster.go +++ b/tests/fixture/tmpnet/start_kind_cluster.go @@ -13,6 +13,7 @@ import ( "strings" "go.uber.org/zap" + "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" @@ -37,6 +38,13 @@ const ( // TODO(marun) Check for the presence of the context rather than string matching on this error missingContextMsg = `context "` + KindKubeconfigContext + `" does not exist` + + // Ingress controller constants + ingressNamespace = "ingress-nginx" + ingressReleaseName = "ingress-nginx" + ingressChartRepo = "https://kubernetes.github.io/ingress-nginx" + ingressChartName = "ingress-nginx/ingress-nginx" + ingressControllerName = "ingress-nginx-controller" ) //go:embed yaml/tmpnet-rbac.yaml @@ -96,10 +104,18 @@ func StartKindCluster( return fmt.Errorf("failed to create service account kubeconfig context: %w", err) } - if err := DeployKubeCollectors(ctx, log, configPath, configContext, startMetricsCollector, startLogsCollector); err != nil { + if err := deployKubeCollectors(ctx, log, configPath, configContext, startMetricsCollector, startLogsCollector); err != nil { return fmt.Errorf("failed to deploy kube collectors: %w", err) } + if err := deployIngressController(ctx, log, configPath, configContext); err != nil { + return fmt.Errorf("failed to deploy ingress controller: %w", err) + } + + if err := createDefaultsConfigMap(ctx, log, configPath, configContext, DefaultTmpnetNamespace); err != nil { + return fmt.Errorf("failed to create defaults ConfigMap: %w", err) + } + return nil } @@ -227,13 +243,18 @@ func createServiceAccountKubeconfig( return fmt.Errorf("failed to load kubeconfig: %w", err) } - // Check if the context already exists if _, exists := config.Contexts[newContextName]; exists { - log.Info("service account kubeconfig context already exists", + log.Info("service account kubeconfig context exists, recreating to ensure consistency with cluster state", + zap.String("kubeconfig", configPath), + zap.String("context", newContextName), + zap.String("namespace", namespace), + ) + } else { + log.Info("creating new service account kubeconfig context", + zap.String("kubeconfig", configPath), zap.String("context", newContextName), zap.String("namespace", namespace), ) - return nil } // Get the current context (already verified to exist by StartKindCluster) @@ -279,9 +300,162 @@ func createServiceAccountKubeconfig( } log.Info("created service account kubeconfig context", + zap.String("kubeconfig", configPath), zap.String("context", newContextName), zap.String("namespace", namespace), ) return nil } + +// deployIngressController deploys the nginx ingress controller using Helm. 
+func deployIngressController(ctx context.Context, log logging.Logger, configPath string, configContext string) error { + log.Info("checking if nginx ingress controller is already running") + + isRunning, err := isIngressControllerRunning(ctx, log, configPath, configContext) + if err != nil { + return fmt.Errorf("failed to check nginx ingress controller status: %w", err) + } + if isRunning { + log.Info("nginx ingress controller already running") + return nil + } + + log.Info("deploying nginx ingress controller using Helm") + + // Add the helm repo for ingress-nginx + if err := runHelmCommand(ctx, "repo", "add", "ingress-nginx", ingressChartRepo); err != nil { + return fmt.Errorf("failed to add helm repo: %w", err) + } + if err := runHelmCommand(ctx, "repo", "update"); err != nil { + return fmt.Errorf("failed to update helm repos: %w", err) + } + + // Install nginx-ingress with values set directly via flags + // Using fixed nodePort 30791 for cross-platform compatibility + args := []string{ + "install", + ingressReleaseName, + ingressChartName, + "--namespace", ingressNamespace, + "--create-namespace", + "--wait", + "--set", "controller.service.type=NodePort", + // This port value must match the port configured in scripts/kind-with-registry.sh + "--set", "controller.service.nodePorts.http=30791", + "--set", "controller.admissionWebhooks.enabled=false", + "--set", "controller.config.proxy-read-timeout=600", + "--set", "controller.config.proxy-send-timeout=600", + "--set", "controller.config.proxy-body-size=0", + "--set", "controller.config.proxy-http-version=1.1", + "--set", "controller.metrics.enabled=true", + } + + if err := runHelmCommand(ctx, args...); err != nil { + return fmt.Errorf("failed to install nginx-ingress: %w", err) + } + + return waitForIngressController(ctx, log, configPath, configContext) +} + +// isIngressControllerRunning checks if the nginx ingress controller is already running. +func isIngressControllerRunning(ctx context.Context, log logging.Logger, configPath string, configContext string) (bool, error) { + clientset, err := GetClientset(log, configPath, configContext) + if err != nil { + return false, err + } + + // TODO(marun) Handle the case of the deployment being in a failed state + _, err = clientset.AppsV1().Deployments(ingressNamespace).Get(ctx, ingressControllerName, metav1.GetOptions{}) + isRunning := !apierrors.IsNotFound(err) || err == nil + return isRunning, nil +} + +// waitForIngressController waits for the nginx ingress controller to be ready. 
+func waitForIngressController(ctx context.Context, log logging.Logger, configPath string, configContext string) error { + clientset, err := GetClientset(log, configPath, configContext) + if err != nil { + return fmt.Errorf("failed to get clientset: %w", err) + } + + return wait.PollUntilContextCancel(ctx, statusCheckInterval, true /* immediate */, func(ctx context.Context) (bool, error) { + deployment, err := clientset.AppsV1().Deployments(ingressNamespace).Get(ctx, ingressControllerName, metav1.GetOptions{}) + if err != nil { + log.Debug("failed to get nginx ingress controller deployment", + zap.String("namespace", ingressNamespace), + zap.String("deployment", ingressControllerName), + zap.Error(err), + ) + return false, nil + } + if deployment.Status.ReadyReplicas == 0 { + log.Debug("waiting for nginx ingress controller to become ready", + zap.String("namespace", ingressNamespace), + zap.String("deployment", ingressControllerName), + zap.Int32("readyReplicas", deployment.Status.ReadyReplicas), + zap.Int32("replicas", deployment.Status.Replicas), + ) + return false, nil + } + + log.Info("nginx ingress controller is ready", + zap.String("namespace", ingressNamespace), + zap.String("deployment", ingressControllerName), + zap.Int32("readyReplicas", deployment.Status.ReadyReplicas), + ) + return true, nil + }) +} + +// runHelmCommand runs a Helm command with the given arguments. +func runHelmCommand(ctx context.Context, args ...string) error { + cmd := exec.CommandContext(ctx, "helm", args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + return cmd.Run() +} + +// createDefaultsConfigMap creates a ConfigMap containing defaults for the tmpnet namespace. +func createDefaultsConfigMap(ctx context.Context, log logging.Logger, configPath string, configContext string, namespace string) error { + clientset, err := GetClientset(log, configPath, configContext) + if err != nil { + return fmt.Errorf("failed to get clientset: %w", err) + } + + configMapName := defaultsConfigMapName + + // Check if configmap already exists + _, err = clientset.CoreV1().ConfigMaps(namespace).Get(ctx, configMapName, metav1.GetOptions{}) + if err == nil { + log.Info("defaults ConfigMap already exists", + zap.String("namespace", namespace), + zap.String("configMap", configMapName), + ) + return nil + } + if !apierrors.IsNotFound(err) { + return fmt.Errorf("failed to check for configmap %s/%s: %w", namespace, configMapName, err) + } + + log.Info("creating defaults ConfigMap", + zap.String("namespace", namespace), + zap.String("configMap", configMapName), + ) + + configMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: configMapName, + Namespace: namespace, + }, + Data: map[string]string{ + ingressHostKey: "localhost:30791", + }, + } + + _, err = clientset.CoreV1().ConfigMaps(namespace).Create(ctx, configMap, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create configmap %s/%s: %w", namespace, configMapName, err) + } + + return nil +} diff --git a/tests/fixture/e2e/env.go b/tests/fixture/tmpnet/testenv/test_environment.go similarity index 72% rename from tests/fixture/e2e/env.go rename to tests/fixture/tmpnet/testenv/test_environment.go index 07120a68cb24..2c5f89afae2c 100644 --- a/tests/fixture/e2e/env.go +++ b/tests/fixture/tmpnet/testenv/test_environment.go @@ -1,7 +1,7 @@ // Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. // See the file LICENSE for licensing terms. 
-package e2e +package testenv import ( "context" @@ -17,6 +17,7 @@ import ( "github.com/ava-labs/avalanchego/tests" "github.com/ava-labs/avalanchego/tests/fixture/tmpnet" + "github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags" "github.com/ava-labs/avalanchego/utils/crypto/secp256k1" "github.com/ava-labs/avalanchego/vms/secp256k1fx" ) @@ -77,7 +78,7 @@ func (te *TestEnvironment) Marshal() []byte { } // Initialize a new test environment with a shared network (either pre-existing or newly created). -func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork *tmpnet.Network) *TestEnvironment { +func NewTestEnvironment(tc tests.TestContext, flagVars *flags.FlagVars, desiredNetwork *tmpnet.Network) *TestEnvironment { require := require.New(tc) var network *tmpnet.Network @@ -86,7 +87,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork require.NoError(err) // Consider monitoring flags for any command but stop - if networkCmd != StopNetworkCmd { + if networkCmd != flags.StopNetworkCmd { if flagVars.StartMetricsCollector() { require.NoError(tmpnet.StartPrometheus(tc.DefaultContext(), tc.Log())) } @@ -102,7 +103,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork tc.Log().Warn("unable to check that metrics were collected from an uninitialized network") return } - ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout) + ctx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout) defer cancel() require.NoError(tmpnet.CheckMetricsExist(ctx, tc.Log(), network.UUID)) }) @@ -114,7 +115,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork tc.Log().Warn("unable to check that logs were collected from an uninitialized network") return } - ctx, cancel := context.WithTimeout(context.Background(), DefaultTimeout) + ctx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout) defer cancel() require.NoError(tmpnet.CheckLogsExist(ctx, tc.Log(), network.UUID)) }) @@ -122,7 +123,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } // Attempt to load the network if it may already be running - if networkCmd == StopNetworkCmd || networkCmd == ReuseNetworkCmd || networkCmd == RestartNetworkCmd { + if networkCmd == flags.StopNetworkCmd || networkCmd == flags.ReuseNetworkCmd || networkCmd == flags.RestartNetworkCmd { networkDir := flagVars.NetworkDir() var networkSymlink string // If populated, prompts removal of the referenced symlink if --stop-network is specified if len(networkDir) == 0 { @@ -148,7 +149,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork ) } - if networkCmd == StopNetworkCmd { + if networkCmd == flags.StopNetworkCmd { if len(networkSymlink) > 0 { // Remove the symlink to avoid attempts to reuse the stopped network tc.Log().Info("removing symlink", @@ -167,7 +168,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork os.Exit(0) } - if network != nil && networkCmd == RestartNetworkCmd { + if network != nil && networkCmd == flags.RestartNetworkCmd { require.NoError(network.Restart(tc.DefaultContext())) } } @@ -204,7 +205,7 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork } } - if networkCmd == StartNetworkCmd { + if networkCmd == flags.StartNetworkCmd { os.Exit(0) } @@ -222,40 +223,15 @@ func NewTestEnvironment(tc tests.TestContext, flagVars *FlagVars, desiredNetwork testContext: tc, } - if 
network.DefaultRuntimeConfig.Process != nil { - // Display node IDs and URIs for process-based networks since the nodes are guaranteed to be network accessible - uris := env.GetNodeURIs() - require.NotEmpty(uris, "network contains no nodes") - tc.Log().Info("network nodes are available", - zap.Any("uris", uris), - ) - } else { - // Only display node IDs for kube-based networks since the nodes may not be network accessible and - // port-forwarded URIs are ephemeral - nodeIDs := network.GetAvailableNodeIDs() - require.NotEmpty(nodeIDs, "network contains no nodes") - tc.Log().Info("network nodes are available. Not showing node URIs since kube nodes may be running remotely.", - zap.Strings("nodeIDs", nodeIDs), - ) - } + uris := network.GetNodeURIs() + require.NotEmpty(uris, "network contains no nodes") + tc.Log().Info("network nodes are available", + zap.Any("uris", uris), + ) return env } -// Retrieve URIs for validator nodes of the shared network. The URIs -// are only guaranteed to be accessible until the environment test -// context is torn down (usually the duration of execution of a single -// test). -func (te *TestEnvironment) GetNodeURIs() []tmpnet.NodeURI { - var ( - tc = te.testContext - network = te.GetNetwork() - ) - uris, err := network.GetNodeURIs(tc.DefaultContext(), tc.DeferCleanup) - require.NoError(tc, err) - return uris -} - // Retrieve a random URI to naively attempt to spread API load across nodes. func (te *TestEnvironment) GetRandomNodeURI() tmpnet.NodeURI { var ( @@ -279,15 +255,10 @@ func (te *TestEnvironment) GetRandomNodeURI() tmpnet.NodeURI { require.NotEmpty(tc, availableNodes, "no available nodes to target") - // Use a local URI for the node to ensure compatibility with kube randomNode := availableNodes[r.Intn(len(availableNodes))] - uri, cancel, err := randomNode.GetLocalURI(tc.DefaultContext()) - require.NoError(tc, err) - tc.DeferCleanup(cancel) - nodeURI := tmpnet.NodeURI{ NodeID: randomNode.NodeID, - URI: uri, + URI: randomNode.GetAccessibleURI(), } tc.Log().Info("targeting random node", zap.Stringer("nodeID", nodeURI.NodeID), @@ -324,6 +295,85 @@ func (te *TestEnvironment) StartPrivateNetwork(network *tmpnet.Network) { network, te.RootNetworkDir, te.PrivateNetworkShutdownDelay, - EmptyNetworkCmd, + flags.EmptyNetworkCmd, ) } + +// Start a temporary network with the provided avalanchego binary. +func StartNetwork( + tc tests.TestContext, + network *tmpnet.Network, + rootNetworkDir string, + shutdownDelay time.Duration, + networkCmd flags.NetworkCmd, +) { + require := require.New(tc) + + nodeCount := len(network.Nodes) + timeout, err := network.DefaultRuntimeConfig.GetNetworkStartTimeout(nodeCount) + require.NoError(err) + tc.Log().Info("waiting for network to start", + zap.Float64("timeoutSeconds", timeout.Seconds()), + ) + ctx := tc.ContextWithTimeout(timeout) + + err = tmpnet.BootstrapNewNetwork( + ctx, + tc.Log(), + network, + rootNetworkDir, + ) + if err != nil { + tc.DeferCleanup(func() { + tc.Log().Info("shutting down network") + ctx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout) + defer cancel() + require.NoError(network.Stop(ctx)) + }) + require.NoError(err, "failed to bootstrap network") + } + + tc.Log().Info("network started successfully") + + symlinkPath, err := tmpnet.GetReusableNetworkPathForOwner(network.Owner) + require.NoError(err) + + if networkCmd == flags.ReuseNetworkCmd || networkCmd == flags.RestartNetworkCmd { + // Symlink the path of the created network to the default owner path (e.g. 
latest_avalanchego-e2e) + // to enable easy discovery for reuse. + require.NoError(os.Symlink(network.Dir, symlinkPath)) + tc.Log().Info("symlinked network dir for reuse", + zap.String("networkDir", network.Dir), + zap.String("symlinkPath", symlinkPath), + ) + } + + tc.DeferCleanup(func() { + if networkCmd == flags.ReuseNetworkCmd || networkCmd == flags.RestartNetworkCmd { + tc.Log().Info("skipping shutdown for network intended for reuse", + zap.String("networkDir", network.Dir), + zap.String("symlinkPath", symlinkPath), + ) + return + } + + if networkCmd == flags.StartNetworkCmd { + tc.Log().Info("skipping shutdown for --start-network", + zap.String("networkDir", network.Dir), + ) + return + } + + if shutdownDelay > 0 { + tc.Log().Info("delaying network shutdown to ensure final metrics scrape", + zap.Duration("delay", shutdownDelay), + ) + time.Sleep(shutdownDelay) + } + + tc.Log().Info("shutting down network") + ctx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout) + defer cancel() + require.NoError(network.Stop(ctx)) + }) +} diff --git a/tests/fixture/tmpnet/utils.go b/tests/fixture/tmpnet/utils.go index e80070326900..121fe6a062bb 100644 --- a/tests/fixture/tmpnet/utils.go +++ b/tests/fixture/tmpnet/utils.go @@ -44,6 +44,14 @@ func CheckNodeHealth(ctx context.Context, uri string) (*health.APIReply, error) return nil, err } } + + // Assume `503 Service Unavailable` is the result of the ingress + // for the node not being ready. + // TODO(marun) Update Client.Health() to return a typed error + if err != nil && err.Error() == "received status code: 503" { + return nil, err + } + // Assume all other errors are not recoverable return nil, fmt.Errorf("%w: %w", ErrUnrecoverableNodeHealthCheck, err) } @@ -54,25 +62,18 @@ type NodeURI struct { URI string } -// GetNodeURIs returns the URIs of the provided nodes that are running and not ephemeral. The URIs returned -// are guaranteed be reachable by the caller until the cleanup function is called regardless of whether the -// nodes are running as local processes or in a kube cluster. -func GetNodeURIs(ctx context.Context, nodes []*Node, deferCleanupFunc func(func())) ([]NodeURI, error) { +// GetNodeURIs returns the accessible URIs of the provided nodes that are running and not ephemeral. +func GetNodeURIs(nodes []*Node) []NodeURI { availableNodes := FilterAvailableNodes(nodes) uris := []NodeURI{} for _, node := range availableNodes { - uri, cancel, err := node.GetLocalURI(ctx) - if err != nil { - return nil, err - } - deferCleanupFunc(cancel) uris = append(uris, NodeURI{ NodeID: node.NodeID, - URI: uri, + URI: node.GetAccessibleURI(), }) } - return uris, nil + return uris } // FilteredAvailableNodes filters the provided nodes by whether they are running and not ephemeral. @@ -96,15 +97,10 @@ func FilterAvailableNodes(nodes []*Node) []*Node { // blockchain ID, in the form "ws:///ext/bc//ws". // Ephemeral and stopped nodes are ignored. 
 func GetNodeWebsocketURIs(
-	ctx context.Context,
 	nodes []*Node,
 	blockchainID string,
-	deferCleanupFunc func(func()),
 ) ([]string, error) {
-	nodeURIs, err := GetNodeURIs(ctx, nodes, deferCleanupFunc)
-	if err != nil {
-		return nil, fmt.Errorf("failed to get node URIs: %w", err)
-	}
+	nodeURIs := GetNodeURIs(nodes)
 	wsURIs := make([]string, len(nodeURIs))
 	for i := range nodeURIs {
 		uri, err := url.Parse(nodeURIs[i].URI)
diff --git a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml
index 0d3056614a85..21db7582fd0f 100644
--- a/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml
+++ b/tests/fixture/tmpnet/yaml/tmpnet-rbac.yaml
@@ -11,6 +11,7 @@ metadata:
   name: tmpnet
   namespace: tmpnet
 rules:
+# Regular usage
 - apiGroups: ["apps"]
   resources: ["statefulsets"]
   verbs: ["get", "create", "update", "patch"]
@@ -23,6 +24,19 @@ rules:
 - apiGroups: [""]
   resources: ["pods/portforward"]
   verbs: ["create"]
+# Enable external node access via ingress
+- apiGroups: ["networking.k8s.io"]
+  resources: ["ingresses"]
+  verbs: ["get", "create"]
+- apiGroups: [""]
+  resources: ["configmaps"]
+  verbs: ["get"]
+- apiGroups: [""]
+  resources: ["endpoints"]
+  verbs: ["get"]
+- apiGroups: [""]
+  resources: ["services"]
+  verbs: ["create"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: RoleBinding
diff --git a/tests/load/c/main/main.go b/tests/load/c/main/main.go
index d1673f4baab9..210c6fff3f3c 100644
--- a/tests/load/c/main/main.go
+++ b/tests/load/c/main/main.go
@@ -19,6 +19,7 @@ import (
 	"github.com/ava-labs/avalanchego/tests"
 	"github.com/ava-labs/avalanchego/tests/fixture/e2e"
 	"github.com/ava-labs/avalanchego/tests/fixture/tmpnet"
+	"github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags"
 	"github.com/ava-labs/avalanchego/tests/load"
 	"github.com/ava-labs/avalanchego/tests/load/c"
 	"github.com/ava-labs/avalanchego/tests/load/c/listener"
@@ -35,10 +36,10 @@ const (
 	logPrefix = "avalanchego-load-test"
 )
-var flagVars *e2e.FlagVars
+var flagVars *flags.FlagVars
 func init() {
-	flagVars = e2e.RegisterFlags()
+	flagVars = flags.RegisterFlags()
 	flag.Parse()
 }
@@ -92,7 +93,7 @@ func main() {
 		)
 	})
-	endpoints, err := tmpnet.GetNodeWebsocketURIs(ctx, network.Nodes, blockchainID, tc.DeferCleanup)
+	endpoints, err := tmpnet.GetNodeWebsocketURIs(network.Nodes, blockchainID)
 	require.NoError(err, "failed to get node websocket URIs")
 	w := &workload{
diff --git a/tests/load2/main/main.go b/tests/load2/main/main.go
index 332c2a844108..a99140d7c7b5 100644
--- a/tests/load2/main/main.go
+++ b/tests/load2/main/main.go
@@ -13,8 +13,9 @@ import (
 	"github.com/stretchr/testify/require"
 	"github.com/ava-labs/avalanchego/tests"
-	"github.com/ava-labs/avalanchego/tests/fixture/e2e"
 	"github.com/ava-labs/avalanchego/tests/fixture/tmpnet"
+	"github.com/ava-labs/avalanchego/tests/fixture/tmpnet/flags"
+	"github.com/ava-labs/avalanchego/tests/fixture/tmpnet/testenv"
 	"github.com/ava-labs/avalanchego/tests/load"
 	"github.com/ava-labs/avalanchego/tests/load2"
 )
@@ -27,13 +28,13 @@ const (
 )
 var (
-	flagVars *e2e.FlagVars
+	flagVars *flags.FlagVars
 	loadTimeout time.Duration
 )
 func init() {
-	flagVars = e2e.RegisterFlags()
+	flagVars = flags.RegisterFlags()
 	flag.DurationVar(
 		&loadTimeout,
@@ -64,10 +65,10 @@ func main() {
 		PreFundedKeys: keys,
 	}
-	e2e.NewTestEnvironment(tc, flagVars, network)
+	testenv.NewTestEnvironment(tc, flagVars, network)
 	ctx := tests.DefaultNotifyContext(0, tc.DeferCleanup)
-	wsURIs, err := tmpnet.GetNodeWebsocketURIs(ctx, network.Nodes, blockchainID, tc.DeferCleanup)
+	wsURIs, err := 
tmpnet.GetNodeWebsocketURIs(network.Nodes, blockchainID) require.NoError(err) registry := prometheus.NewRegistry() diff --git a/tests/log.go b/tests/log.go index a431feb3c1ec..134cc9cd1f78 100644 --- a/tests/log.go +++ b/tests/log.go @@ -25,5 +25,6 @@ func LoggerForFormat(prefix string, rawLogFormat string) (logging.Logger, error) if err != nil { return nil, err } - return logging.NewLogger(prefix, logging.NewWrappedCore(logging.Verbo, writeCloser, logFormat.ConsoleEncoder())), nil + // TODO(marun) Make the log level configurable + return logging.NewLogger(prefix, logging.NewWrappedCore(logging.Debug, writeCloser, logFormat.ConsoleEncoder())), nil } diff --git a/tests/upgrade/upgrade_test.go b/tests/upgrade/upgrade_test.go index 1b35f4ed99c9..59658d8263fd 100644 --- a/tests/upgrade/upgrade_test.go +++ b/tests/upgrade/upgrade_test.go @@ -44,7 +44,7 @@ func init() { "avalanchego executable path to upgrade to", ) collectorVars = flags.NewCollectorFlagVars() - e2e.SetCheckCollectionFlags( + flags.SetCheckCollectionFlags( &checkMetricsCollected, &checkLogsCollected, ) @@ -99,7 +99,7 @@ var _ = ginkgo.Describe("[Upgrade]", func() { network, "", /* rootNetworkDir */ shutdownDelay, - e2e.EmptyNetworkCmd, + flags.EmptyNetworkCmd, ) tc.By(fmt.Sprintf("restarting all nodes with %q binary", avalancheGoExecPathToUpgradeTo))
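
Call sites migrating from GetLocalURI(ctx) (string, func(), error) to GetAccessibleURI() string can drop the cleanup plumbing entirely. A minimal sketch of the resulting caller shape, using only APIs shown in this diff; the checkNodeIsHealthy helper and its package are hypothetical, not part of the change:

package example

import (
	"context"
	"fmt"

	"github.com/ava-labs/avalanchego/tests/fixture/tmpnet"
)

// checkNodeIsHealthy is illustrative only: it probes a node's health endpoint
// through its accessible URI.
func checkNodeIsHealthy(ctx context.Context, node *tmpnet.Node) error {
	// GetAccessibleURI no longer returns a cleanup func, so the URI can be
	// used directly without deferring a port-forward teardown.
	uri := node.GetAccessibleURI()
	reply, err := tmpnet.CheckNodeHealth(ctx, uri)
	if err != nil {
		return err
	}
	if !reply.Healthy {
		return fmt.Errorf("node %s reported unhealthy", node.NodeID)
	}
	return nil
}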
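
By contrast, GetAccessibleStakingAddress keeps its cancel function, so callers that dial the staking port still need to hold the returned cleanup open for as long as they use the address, presumably so a remote runtime can tear down any forwarding it set up. A hedged sketch; dialStakingPort is illustrative, not part of the change:

package example

import (
	"context"
	"net"

	"github.com/ava-labs/avalanchego/tests/fixture/tmpnet"
)

// dialStakingPort is illustrative only: it checks that a node's staking
// address is reachable from the test process.
func dialStakingPort(ctx context.Context, node *tmpnet.Node) error {
	addr, cancel, err := node.GetAccessibleStakingAddress(ctx)
	if err != nil {
		return err
	}
	// The cancel func must outlive any use of addr.
	defer cancel()

	var d net.Dialer
	conn, err := d.DialContext(ctx, "tcp", addr.String())
	if err != nil {
		return err
	}
	return conn.Close()
}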
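
deployIngressController pins the controller's HTTP NodePort to 30791, the same port scripts/kind-with-registry.sh is expected to map, and createDefaultsConfigMap records localhost:30791 as the ingress host. A rough way to sanity-check that wiring from the host machine, assuming a default ingress-nginx install where unmatched requests get a 404 from the default backend; ingressIsServing is illustrative only:

package example

import (
	"context"
	"net/http"
)

// ingressIsServing is illustrative only. It reports whether anything answers
// HTTP on the NodePort that deployIngressController configures.
func ingressIsServing(ctx context.Context) bool {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost:30791/", nil)
	if err != nil {
		return false
	}
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return false
	}
	defer resp.Body.Close()
	// Any HTTP response, even a 404 from the default backend, means the
	// controller is up and listening on the NodePort.
	return true
}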
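
Consumers in the tmpnet namespace can discover the ingress host by reading the defaults ConfigMap rather than hard-coding the port, which is why the role now grants get on configmaps. A sketch of the read side; lookupIngressHost and its parameters are illustrative, and the real defaultsConfigMapName and ingressHostKey constants are defined elsewhere in the package:

package example

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// lookupIngressHost is illustrative only: it reads the host recorded by
// createDefaultsConfigMap from the given namespace.
func lookupIngressHost(
	ctx context.Context,
	clientset kubernetes.Interface,
	namespace string,
	configMapName string,
	hostKey string,
) (string, error) {
	cm, err := clientset.CoreV1().ConfigMaps(namespace).Get(ctx, configMapName, metav1.GetOptions{})
	if err != nil {
		return "", fmt.Errorf("failed to get configmap %s/%s: %w", namespace, configMapName, err)
	}
	host, ok := cm.Data[hostKey]
	if !ok {
		return "", fmt.Errorf("configmap %s/%s has no %q entry", namespace, configMapName, hostKey)
	}
	return host, nil
}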