diff --git a/.github/workflows/qualify.yaml b/.github/workflows/qualify.yaml index 5cc9d7700..e65c8c979 100644 --- a/.github/workflows/qualify.yaml +++ b/.github/workflows/qualify.yaml @@ -8,6 +8,7 @@ on: description: "The version that should be qualified" type: string default: "" + # Run one qualification per commit. # This means we can have multiple qualifications of different versions # in parallel but only one qualification of each commit @@ -16,14 +17,33 @@ concurrency: cancel-in-progress: true jobs: + setup: + runs-on: + labels: dre-runner-custom + container: ghcr.io/dfinity/dre/actions-runner:7efd87b0eac3ebd255be7efe00a3b39b0f9e9fc1 + outputs: + matrix: ${{ steps.generate.outputs.output }} + steps: + - id: generate + shell: bash + run: | + sudo apt-get install -y jq + UNIQUE_VERSIONS=$(curl https://rollout-dashboard.ch1-rel1.dfinity.network/api/v1/rollouts | jq -r '.[] | select (.state != "failed") | select (.state != "complete") | .batches | to_entries[] | "\(.value)"' | jq '.subnets[].git_revision' | sort | uniq | jq -s ) + echo "Will qualify starting from versions: ${UNIQUE_VERSIONS}" + echo "output=$(jq -cn --argjson versions "$UNIQUE_VERSIONS" '{version: $versions}')" >> $GITHUB_OUTPUT + qualify: + name: Qualifying ${{ matrix.version }} -> ${{ inputs.version }} + needs: setup + strategy: + matrix: ${{ fromJson(needs.setup.outputs.matrix) }} runs-on: labels: dre-runner-custom container: ghcr.io/dfinity/dre/actions-runner:7efd87b0eac3ebd255be7efe00a3b39b0f9e9fc1 steps: - uses: actions/checkout@v4 with: - repository: 'dfinity/dre' # this needs to be specified so it can be kicked off from the ic repo + repository: "dfinity/dre" # this needs to be specified so it can be kicked off from the ic repo - name: "🔍 Check if the version is set" shell: bash @@ -49,4 +69,4 @@ jobs: run: | mkdir -p ~/.config/dfx/identity/xnet-testing/ echo "${{ secrets.XNET_PRINCIPAL_KEY }}" > ~/.config/dfx/identity/xnet-testing/identity.pem - bazel run //rs/qualifier -- "${{ 
inputs.version }}" + bazel run //rs/qualifier -- "${{ inputs.version }}" --initial-versions ${{ matrix.version }} diff --git a/Cargo.Bazel.lock b/Cargo.Bazel.lock index 02bc00880..1652f1f01 100644 --- a/Cargo.Bazel.lock +++ b/Cargo.Bazel.lock @@ -1,5 +1,5 @@ { - "checksum": "b595f31cbc3e94b284e51ec8b54de93639b9409006890414e0c0d18f4b917606", + "checksum": "ea5dd38d60356ea002de789a5eb67763d68cb6b2e016ed921f20e9c61a9b1d9d", "crates": { "actix-codec 0.5.2": { "name": "actix-codec", @@ -35361,6 +35361,10 @@ "id": "dirs 5.0.1", "target": "dirs" }, + { + "id": "futures 0.3.30", + "target": "futures" + }, { "id": "ic-nervous-system-common-test-keys 0.9.0", "target": "ic_nervous_system_common_test_keys" diff --git a/Cargo.lock b/Cargo.lock index 4b3092bc6..aab53a540 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7096,6 +7096,7 @@ dependencies = [ "clap 4.5.16", "dirs", "dre", + "futures", "ic-canisters", "ic-management-backend", "ic-management-types", diff --git a/rs/cli/src/qualification/mod.rs b/rs/cli/src/qualification/mod.rs index 286c643ab..79103614d 100644 --- a/rs/cli/src/qualification/mod.rs +++ b/rs/cli/src/qualification/mod.rs @@ -245,7 +245,13 @@ impl QualificationExecutor { step: s, }) .collect_vec(), - step_ctx: StepCtx::new(ctx.dre_ctx, ctx.artifacts, ctx.grafana_endpoint)?, + step_ctx: StepCtx::new( + ctx.dre_ctx, + ctx.artifacts, + ctx.grafana_endpoint, + ctx.from_version.clone(), + ctx.to_version.clone(), + )?, from_version: ctx.from_version, to_version: ctx.to_version, }) diff --git a/rs/cli/src/qualification/util.rs b/rs/cli/src/qualification/util.rs index ffb6b917f..fc46fad0e 100644 --- a/rs/cli/src/qualification/util.rs +++ b/rs/cli/src/qualification/util.rs @@ -31,10 +31,18 @@ pub struct StepCtx { log_path: Option, client: Client, grafana_url: Option, + from_version: String, + to_version: String, } impl StepCtx { - pub fn new(dre_ctx: DreContext, artifacts: Option, grafana_url: Option) -> anyhow::Result { + pub fn new( + dre_ctx: DreContext, + artifacts: 
Option, + grafana_url: Option, + from_version: String, + to_version: String, + ) -> anyhow::Result { let artifacts_of_run = artifacts.as_ref().map(|t| { if let Err(e) = std::fs::create_dir_all(t) { panic!("Couldn't create dir {}: {:?}", t.display(), e) @@ -53,6 +61,8 @@ impl StepCtx { artifacts: artifacts_of_run, client: ClientBuilder::new().timeout(REQWEST_TIMEOUT).build()?, grafana_url, + from_version: from_version[..6].to_string(), + to_version: to_version[..6].to_string(), }) } @@ -186,8 +196,10 @@ impl StepCtx { fn _print_with_time(&self, message: String, add_new_line: bool) { let current_time = Utc::now(); let formatted = format!( - "[{}]{}{}", + "[{} {} -> {}]{}{}", current_time, + self.from_version, + self.to_version, match add_new_line { true => '\n', false => ' ', diff --git a/rs/qualifier/Cargo.toml b/rs/qualifier/Cargo.toml index 531cd576b..4fc03dfa0 100644 --- a/rs/qualifier/Cargo.toml +++ b/rs/qualifier/Cargo.toml @@ -28,3 +28,4 @@ backon = { workspace = true } chrono.workspace = true indexmap.workspace = true strum.workspace = true +futures.workspace = true diff --git a/rs/qualifier/src/cli.rs b/rs/qualifier/src/cli.rs index 44ca7dbe5..02a96a685 100644 --- a/rs/qualifier/src/cli.rs +++ b/rs/qualifier/src/cli.rs @@ -3,6 +3,7 @@ use std::{path::PathBuf, process::Stdio, str::FromStr}; use clap::Parser; use ic_nervous_system_common_test_keys::TEST_NEURON_1_OWNER_KEYPAIR; +use strum::Display; use tokio::process::Command; const TEST_NEURON_1_IDENTITY_PATH: &str = ".config/dfx/identity/test_neuron_1/identity.pem"; const XNET_TESTING_IDENTITY_PATH: &str = ".config/dfx/identity/xnet-testing/identity.pem"; @@ -13,11 +14,14 @@ pub struct Args { /// Version to qualify pub version_to_qualify: String, - /// Specify a version from which the qualification - /// should start. The default will be the same - /// version as the NNS + /// Specify a list of versions from which the qualification + /// should start. 
The default will be the same forecasted + /// versions that will end up on mainnet after the active + /// rollout is finished. + /// + /// The information is gathered from https://rollout-dashboard.ch1-rel1.dfinity.network/api/v1/rollouts #[clap(long)] - pub initial_version: Option, + pub initial_versions: Option>, /// Path which contains the layout of the network to /// be deployed. The default value will be a network @@ -41,6 +45,35 @@ pub struct Args { /// A range can be: `4`, `3..`, `..3, `1..3` #[clap(long)] pub step_range: Option, + + /// If there are multiple forecasted versions on the network at + /// the end of an active rollout this controls how the qualification + /// will run. + #[clap(long, default_value_t = QualificationMode::Sequential)] + pub mode: QualificationMode, +} + +#[derive(Display, Clone, clap::ValueEnum)] +#[strum(serialize_all = "snake_case")] +pub enum QualificationMode { + /// Less invasive towards farm, but slower. + /// + /// If default config is used this means 16 VMs. + /// Each qualification is run in sequence and + /// observed time for one qualification is roughly + /// 1h 30mins, meaning that if there are more than + /// 2 starting versions, qualification can take up + /// to 5 hours to complete. + Sequential, + /// More invasive towards farm, but faster. + /// + /// If the default config is used this means that + /// qualifier will spin up N networks + /// where N is the number of start versions for + /// qualification. Each network (for the default config) + /// will take 16 VMs, meaning that in total qualifier + /// will take 16 * N VMs. 
+ Parallel, } impl Args { diff --git a/rs/qualifier/src/ict_util.rs b/rs/qualifier/src/ict_util.rs index 853f63a18..86aab9a7a 100644 --- a/rs/qualifier/src/ict_util.rs +++ b/rs/qualifier/src/ict_util.rs @@ -1,4 +1,4 @@ -use std::{path::PathBuf, process::Stdio, str::FromStr, time::Duration}; +use std::{path::PathBuf, process::Stdio, time::Duration}; use itertools::Itertools; use log::info; @@ -19,9 +19,8 @@ const KEEPALIVE_PERIOD: Duration = Duration::from_secs(30); const KEEPALIVE_PERIOD_ERROR: Duration = Duration::from_secs(5); pub const FARM_BASE_URL: &str = "https://farm.dfinity.systems"; -pub async fn ict(ic_git: PathBuf, config: String, token: CancellationToken, sender: Sender) -> anyhow::Result<()> { - let ic_config = PathBuf::from_str("/tmp/ic_config.json")?; - std::fs::write(&ic_config, &config)?; +pub async fn ict(ic_git: PathBuf, token: CancellationToken, sender: Sender, artifacts: PathBuf) -> anyhow::Result<()> { + let ic_config = artifacts.join("ic-config.json"); let command = "gitlab-ci/container/container-run.sh"; let args = &[ diff --git a/rs/qualifier/src/main.rs b/rs/qualifier/src/main.rs index 40ba55399..d46699b54 100644 --- a/rs/qualifier/src/main.rs +++ b/rs/qualifier/src/main.rs @@ -1,7 +1,13 @@ -use std::{fmt::Display, path::PathBuf, str::FromStr, time::Duration}; +use std::{ + fmt::Display, + path::{Path, PathBuf}, + str::FromStr, + time::Duration, +}; use clap::Parser; use cli::Args; +use futures::future::join_all; use ict_util::ict; use log::info; use qualify_util::qualify; @@ -33,26 +39,70 @@ async fn main() -> anyhow::Result<()> { info!("Principal key created"); args.ensure_xnet_test_key()?; - // Take in one version and figure out what is the base version - // - // To find the initial version we could take NNS version? 
- let initial_version = if let Some(ref v) = args.initial_version { - v.to_string() + + let initial_versions = if let Some(ref v) = args.initial_versions { + v } else { - info!("Fetching the forcasted version of NNS which will be used as starting point"); + info!("Fetching the forecasted versions from mainnet which will be used as starting point"); // Fetch the starter versions let start_version_selector = StartVersionSelectorBuilder::new() .with_client(ClientBuilder::new().connect_timeout(Duration::from_secs(30))) .build() .await?; - start_version_selector.get_forcasted_version_for_mainnet_nns()? + &start_version_selector.get_forecasted_versions_from_mainnet()? }; + info!("Initial versions that will be used: {}", initial_versions.join(",")); + + args.ensure_git().await?; + + let artifacts = PathBuf::from_str("/tmp/qualifier-artifacts")?.join(&args.version_to_qualify); + info!("Will store artifacts in: {}", artifacts.display()); + std::fs::create_dir_all(&artifacts)?; + if artifacts.exists() { + info!("Making sure artifact store is empty"); + std::fs::remove_dir_all(&artifacts)?; + std::fs::create_dir(&artifacts)?; + } + + info!("Qualification will run in {} mode", args.mode); + let outcomes = match args.mode { + cli::QualificationMode::Sequential => { + let mut outcomes = vec![]; + for iv in initial_versions { + let current_path = &artifacts.join(format!("from-{}", iv)); + if let Err(e) = std::fs::create_dir(current_path) { + outcomes.push(Err(anyhow::anyhow!(e))) + } + outcomes.push(run_qualification(&args, iv.clone(), current_path, neuron_id, &private_key_pem).await) + } + outcomes + } + cli::QualificationMode::Parallel => { + join_all(initial_versions.iter().map(|iv| async { + let current_path = &artifacts.join(format!("from-{}", iv.clone())); + if let Err(e) = std::fs::create_dir(current_path) { + return Err(anyhow::anyhow!(e)); + }; + run_qualification(&args, iv.clone(), current_path, neuron_id, &private_key_pem).await + })) + .await + } + }; + + let errs = 
outcomes.iter().filter(|o| o.is_err()).collect::>(); + if !errs.is_empty() { + anyhow::bail!("Overall qualification failed due to one or more sub-qualifications failing:\n{:?}", errs) + } + + Ok(()) +} + +async fn run_qualification(args: &Args, initial_version: String, artifacts: &Path, neuron_id: u64, private_key_pem: &Path) -> anyhow::Result<()> { if initial_version == args.version_to_qualify { - anyhow::bail!("Initial version and version to qualify are the same") + anyhow::bail!("Starting version and version being qualified are the same: {}", args.version_to_qualify) } - info!("Initial version that will be used: {}", initial_version); // Generate configuration for `ict` including the initial version // @@ -83,15 +133,13 @@ async fn main() -> anyhow::Result<()> { "num_unassigned_nodes": 4, "initial_version": "{}" }}"#, - &initial_version + initial_version ); // Validate that the string is valid json serde_json::to_string_pretty(&serde_json::from_str::(&config)?)? }; - info!("Using configuration: \n{}", config); - - args.ensure_git().await?; + info!("[{} -> {}]: Using configuration: \n{}", initial_version, args.version_to_qualify, config); // Run ict and capture its output // @@ -103,33 +151,25 @@ async fn main() -> anyhow::Result<()> { let token = CancellationToken::new(); let (sender, mut receiver) = mpsc::channel(2); - let artifacts = PathBuf::from_str("/tmp/qualifier-artifacts")?.join(&args.version_to_qualify); - info!("Will store artifacts in: {}", artifacts.display()); - std::fs::create_dir_all(&artifacts)?; - if artifacts.exists() { - info!("Making sure artifact store is empty"); - std::fs::remove_dir_all(&artifacts)?; - std::fs::create_dir(&artifacts)?; - } - let mut file = std::fs::File::create_new(artifacts.join("ic-config.json"))?; writeln!(file, "{}", &config)?; + let current_network_name = format!("{}-{}", NETWORK_NAME, initial_version); tokio::select! 
{ - res = ict(args.ic_repo_path.clone(), config, token.clone(), sender) => res?, + res = ict(args.ic_repo_path.clone(), token.clone(), sender, artifacts.to_path_buf()) => res?, res = qualify( &mut receiver, - private_key_pem, + private_key_pem.to_path_buf(), neuron_id, - NETWORK_NAME, - initial_version, + current_network_name.as_str(), + initial_version.to_owned(), args.version_to_qualify.to_string(), - artifacts, - args.step_range + artifacts.to_path_buf(), + args.step_range.clone() ) => res? }; - info!("Finished qualifier run for: {}", args.version_to_qualify); + info!("Finished qualifier run for: {} -> {}", initial_version, args.version_to_qualify); token.cancel(); Ok(()) diff --git a/rs/qualifier/src/version_selector.rs b/rs/qualifier/src/version_selector.rs index aaba92591..49209fb14 100644 --- a/rs/qualifier/src/version_selector.rs +++ b/rs/qualifier/src/version_selector.rs @@ -3,7 +3,7 @@ use std::time::Duration; use backon::{ExponentialBuilder, Retryable}; use itertools::Itertools; use reqwest::{Client, ClientBuilder}; -use rollouts::{RolloutState, Rollouts}; +use rollouts::{Rollout, RolloutState, Rollouts}; pub struct StartVersionSelectorBuilder { client_builder: ClientBuilder, @@ -38,6 +38,7 @@ impl StartVersionSelectorBuilder { } const DASHBOARD_URL: &str = "https://rollout-dashboard.ch1-rel1.dfinity.network/api/v1/rollouts"; +#[allow(dead_code)] const NNS_SUBNET_ID: &str = "tdb26-jop6k-aogll-7ltgs-eruif-6kk7m-qpktf-gdiqx-mxtrf-vb5e6-eqe"; pub struct StartVersionSelector { rollouts: Rollouts, @@ -55,9 +56,8 @@ impl StartVersionSelector { Ok(Self { rollouts }) } - pub fn get_forcasted_version_for_mainnet_nns(&self) -> anyhow::Result { - let rollout = self - .rollouts + fn get_active_rollout(&self) -> anyhow::Result<&Rollout> { + self.rollouts .iter() .filter(|r| r.state > RolloutState::Failed && r.state < RolloutState::Complete) // with this we basically reverse the sorting @@ -69,13 +69,28 @@ impl StartVersionSelector { .ok_or(anyhow::anyhow!( "No active 
rollouts found in the API. All rollouts: \n{:#?}", self.rollouts - ))?; + )) + } + + pub fn _get_forecasted_version_for_mainnet_nns(&self) -> anyhow::Result { + let rollout = self.get_active_rollout()?; rollout .batches .iter() .find_map(|(_, b)| b.subnets.iter().find(|s| s.subnet_id.eq(NNS_SUBNET_ID)).cloned().map(|s| s.git_revision)) .ok_or(anyhow::anyhow!("Couldn't find NNS in the active rollout: \n{:#?}", rollout)) } + + pub fn get_forecasted_versions_from_mainnet(&self) -> anyhow::Result> { + Ok(self + .get_active_rollout()? + .batches + .iter() + .flat_map(|(_, batch)| batch.subnets.iter().map(|s| s.git_revision.clone())) + .sorted() + .dedup() + .collect()) + } } // TODO: replace with dre-airflow once its public