diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index a25cfc5f59..eb2a27df93 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -21,6 +21,7 @@ from functools import partial from itertools import product from shutil import copyfile +from time import sleep from traceback import format_tb from typing import Any, Dict, List, Optional, Union @@ -366,6 +367,103 @@ def _setup_custom_logger(log_file): logger.addHandler(file_handler) +class SharedClusterDetectionTimeoutError(Exception): + """Custom exception for shared cluster detection timeout.""" + + pass + + +class ClusterManager: + """Cluster manager for the shared cluster fixture; defined at module level to avoid AttributeError: Can't pickle local object.""" + + def __init__(self, request, factory): + self.request = request + self.factory = factory + + def cluster_factory( + self, + cluster_config, + region, + instance, + os, + scheduler, + upper_case_cluster_name=False, + custom_cli_credentials=None, + **kwargs, + ): + """Create a cluster or reuse an existing shared cluster.""" + cluster_key = f"{region}-{instance}-{os}-{scheduler}" + request = self.request + factory = self.factory + logging.info( + "Eligible to use a shared cluster; starting detection." + if cluster_key in request.session.shared_existing_clusters_started_to_create + else "Starting to create a shared cluster for this region, instance type, OS and scheduler combination." + ) + if cluster_key in request.session.shared_existing_clusters_started_to_create: + for retry in range(40): + if cluster_key in request.session.shared_existing_clusters: + logging.info( + f"Shared cluster {request.session.shared_existing_clusters[cluster_key].name} detected." + ) + return request.session.shared_existing_clusters[cluster_key] + else: + logging.info(f"Shared cluster not detected yet. Retrying... ({retry + 1}/40)") + sleep(60) + raise SharedClusterDetectionTimeoutError( + "Timeout: Failed to detect the shared cluster within the allowed retries." + ) + + request.session.shared_existing_clusters_started_to_create.add(cluster_key) + cluster_config = _write_config_to_outdir(request, cluster_config, "clusters_configs") + cluster = Cluster( + name=( + request.config.getoption("cluster") + if request.config.getoption("cluster") + else "integ-tests-{0}{1}{2}".format( + random_alphanumeric().upper() if upper_case_cluster_name else random_alphanumeric(), + "-" if request.config.getoption("stackname_suffix") else "", + request.config.getoption("stackname_suffix"), + ) + ), + config_file=cluster_config, + ssh_key=request.config.getoption("key_path"), + region=region, + custom_cli_credentials=custom_cli_credentials, + ) + if not request.config.getoption("cluster"): + cluster.creation_response = factory.create_cluster(cluster, **kwargs) + request.session.shared_existing_clusters[cluster_key] = cluster + return cluster + + +@xdist_session_fixture(autouse=True) +@pytest.mark.usefixtures("setup_credentials") +def shared_clusters_factory(request): + """ + Define a fixture to manage the creation and destruction of session-shared clusters.
+ + The configs used to create clusters are dumped to output_dir/clusters_configs/{test_name}.config + """ + factory = ClustersFactory(delete_logs_on_success=request.config.getoption("delete_logs_on_success")) + + if not hasattr(request.session, "shared_existing_clusters"): + logging.info("Setting shared_existing_clusters_started_to_create and shared_existing_clusters") + request.session.shared_existing_clusters = {} + request.session.shared_existing_clusters_started_to_create = set() + + manager = ClusterManager(request, factory) + + yield manager + + if not request.config.getoption("no_delete"): + try: + test_passed = request.node.rep_call.passed + except AttributeError: + test_passed = False + factory.destroy_all_clusters(test_passed=test_passed) + + @pytest.fixture(scope="class") @pytest.mark.usefixtures("setup_credentials") def clusters_factory(request, region): @@ -509,9 +607,21 @@ def _write_config_to_outdir(request, config, config_dir): out_dir = request.config.getoption("output_dir") # Sanitize config file name to make it Windows compatible - # request.node.nodeid example: + # class scope request.node.nodeid example: # 'dcv/test_dcv.py::test_dcv_configuration[eu-west-1-c5.xlarge-centos7-slurm-8443-0.0.0.0/0-/shared]' - test_file, test_name = request.node.nodeid.split("::", 1) + # module scope request.node.nodeid example: + # 'performance_tests/test_starccm_and_openfoam.py' + # TODO: Find a better way to name module_scope_test/session_scope_test + logging.info(f"request.node.nodeid: {request.node.nodeid}") + nodeid_parts = request.node.nodeid.split("::") + if len(nodeid_parts) == 2: + test_file, test_name = nodeid_parts + elif len(nodeid_parts) == 1: + test_file = nodeid_parts[0] + test_name = "module_scope_test" + random_alphanumeric() + else: + raise ValueError(f"Unexpected nodeid format: {request.node.nodeid}") + config_file_name = "{0}-{1}".format(test_file, test_name.replace("/", "_")) os.makedirs( diff --git a/tests/integration-tests/tests/performance_tests/common.py b/tests/integration-tests/tests/performance_tests/common.py index 9e49f9dd85..ac7384875f 100644 --- a/tests/integration-tests/tests/performance_tests/common.py +++ b/tests/integration-tests/tests/performance_tests/common.py @@ -27,6 +27,7 @@ PYTEST_PARAMETERIZE_VALUES = [(NUM_COMPUTE_NODES, 1)] TEST_RUNNER_SCRIPT = "/shared/assets/workloads/scale-test/run-scale-test.sh" ROUND_UP_FACTOR = 100_000_000 +PERF_TEST_DIFFERENCE_TOLERANCE = 3 METRICS = [ dict(name="jobRunTime", unit="ms"), @@ -222,3 +223,29 @@ def write_results_to_output_dir( paths["baseline"]["statistics.json"], paths[candidate_configuration]["statistics.json"], ) + + +def perf_test_difference(observed_value, baseline_value): + percentage_difference = 100 * (observed_value - baseline_value) / baseline_value + return percentage_difference + + +def _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value): + percentage_difference = perf_test_difference(observed_value, baseline_value) + if percentage_difference < 0: + outcome = "improvement" + elif percentage_difference == 0: + outcome = "matching baseline" + elif percentage_difference <= PERF_TEST_DIFFERENCE_TOLERANCE: + outcome = "degradation (within tolerance)" + else: + outcome = "degradation (above tolerance)" + performance_degradation[node] = { + "baseline": baseline_value, + "observed": observed_value, + "percentage_difference": percentage_difference, + } + logging.info( + f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, 
" + f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" + ) diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam.py b/tests/integration-tests/tests/performance_tests/test_openfoam.py deleted file mode 100644 index 9689941300..0000000000 --- a/tests/integration-tests/tests/performance_tests/test_openfoam.py +++ /dev/null @@ -1,97 +0,0 @@ -import logging - -import pytest -from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor - -# timeout in seconds -OPENFOAM_INSTALLATION_TIMEOUT = 300 -OPENFOAM_JOB_TIMEOUT = 5400 # Takes long time because during the first time, it's not only execute the job but also -# builds and installs many things -TASK_VCPUS = 36 # vCPUs are cut in a half because multithreading is disabled -BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS = { - "alinux2": {8: 754, 16: 366, 32: 182}, # v3.1.3 - "ubuntu2204": {8: 742, 16: 376, 32: 185}, # v3.7.0 just a placeholder, Ubuntu22.04 not supported - "ubuntu2004": {8: 750, 16: 382, 32: 187}, # v3.1.3 - "centos7": {8: 755, 16: 371, 32: 190}, # v3.1.3 - "rhel8": {8: 742, 16: 376, 32: 185}, # v3.6.0 just a placeholder, RHEL8 not supported - "rocky8": {8: 742, 16: 376, 32: 185}, # v3.8.0 just a placeholder, Rocky8 not supported -} -PERF_TEST_DIFFERENCE_TOLERANCE = 3 - - -def perf_test_difference(observed_value, baseline_value): - percentage_difference = 100 * (observed_value - baseline_value) / baseline_value - return percentage_difference - - -def openfoam_installed(headnode): - cmd = '[ -d "/shared/SubspaceBenchmarks" ]' - try: - headnode.run_remote_command(cmd) - return True - except RemoteCommandExecutionError: - return False - - -@pytest.mark.parametrize( - "number_of_nodes", - [[8, 16, 32]], -) -def test_openfoam( - vpc_stack, - instance, - os, - region, - scheduler, - pcluster_config_reader, - clusters_factory, - number_of_nodes, - test_datadir, -): - cluster_config = pcluster_config_reader(number_of_nodes=max(number_of_nodes)) - cluster = clusters_factory(cluster_config) - logging.info("Cluster Created") - remote_command_executor = RemoteCommandExecutor(cluster) - if not openfoam_installed(remote_command_executor): - logging.info("Installing OpenFOAM") - remote_command_executor.run_remote_script( - str(test_datadir / "openfoam.install.sh"), timeout=OPENFOAM_INSTALLATION_TIMEOUT, hide=False - ) - logging.info("OpenFOAM Installed") - performance_degradation = {} - subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" - for node in number_of_nodes: - logging.info(f"Submitting OpenFOAM job with {node} nodes") - remote_command_executor.run_remote_command( - f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{node}" 2>&1', - additional_files=[str(test_datadir / "openfoam.slurm.sh")], - timeout=OPENFOAM_JOB_TIMEOUT, - ) - perf_test_result = remote_command_executor.run_remote_script( - (str(test_datadir / "openfoam.results.sh")), hide=False - ) - output = perf_test_result.stdout.strip() - observed_value = int(output.split("\n")[-1].strip()) - baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds") - percentage_difference = perf_test_difference(observed_value, baseline_value) - if percentage_difference < 0: - outcome = "improvement" - elif percentage_difference <= PERF_TEST_DIFFERENCE_TOLERANCE: - outcome = "degradation (within tolerance)" - else: - outcome = "degradation (above tolerance)" - performance_degradation[node] = { - "baseline": baseline_value, - "observed": 
observed_value, - "percentage_difference": percentage_difference, - } - logging.info( - f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " - f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" - ) - - if performance_degradation: - pytest.fail(f"Performance degradation detected: {performance_degradation}") - else: - logging.info("Performance test results show no performance degradation") diff --git a/tests/integration-tests/tests/performance_tests/test_starccm.py b/tests/integration-tests/tests/performance_tests/test_starccm.py deleted file mode 100644 index 83dd9f5d9f..0000000000 --- a/tests/integration-tests/tests/performance_tests/test_starccm.py +++ /dev/null @@ -1,129 +0,0 @@ -import json -import logging - -import boto3 -import pytest -from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor - -from tests.common.utils import assert_no_file_handler_leak, get_compute_ip_to_num_files - -# timeout in seconds -STARCCM_INSTALLATION_TIMEOUT = 1800 -STARCCM_JOB_TIMEOUT = 600 -STARCCM_LICENCE_SECRET = "starccm-license-secret" -TASK_VCPUS = 36 # vCPUs are cut in a half because multithreading is disabled -BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS = { - "alinux2023": {8: 62.414, 16: 31.998, 32: 20.422}, # v3.10.0 - "alinux2": {8: 64.475, 16: 33.173, 32: 17.899}, # v3.1.3 - "ubuntu2204": {8: 75.502, 16: 36.353, 32: 19.688}, # v3.7.0 - "ubuntu2004": {8: 67.384, 16: 36.434, 32: 19.449}, # v3.1.3 - "centos7": {8: 67.838, 16: 36.568, 32: 20.935}, # v3.1.3 - "rhel8": {8: 66.494, 16: 36.154, 32: 20.347}, # v3.6.0 - "rocky8": {8: 66.859, 16: 36.184, 32: 21.090}, # v3.8.0 -} -PERF_TEST_DIFFERENCE_TOLERANCE = 3 - -OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] - - -def get_starccm_secrets(region_name): - secrets_manager_client = boto3.client("secretsmanager", region_name=region_name) - response = secrets_manager_client.get_secret_value(SecretId=STARCCM_LICENCE_SECRET)["SecretString"] - secrets = json.loads(response) - return secrets["podkey"], secrets["licpath"] - - -def perf_test_difference(observed_value, baseline_value): - percentage_difference = 100 * (observed_value - baseline_value) / baseline_value - return percentage_difference - - -def starccm_installed(headnode): - cmd = "/shared/STAR-CCM+/18.02.008/STAR-CCM+18.02.008/star/bin/starccm+ --version" - try: - headnode.run_remote_command(cmd) - return True - except RemoteCommandExecutionError: - return False - - -@pytest.mark.parametrize( - "number_of_nodes", - [[8, 16, 32]], -) -def test_starccm( - vpc_stack, - instance, - os, - region, - scheduler, - pcluster_config_reader, - clusters_factory, - number_of_nodes, - test_datadir, - scheduler_commands_factory, - s3_bucket_factory, -): - # Create S3 bucket for custom actions scripts - bucket_name = s3_bucket_factory() - s3 = boto3.client("s3") - s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") - - cluster_config = pcluster_config_reader( - bucket_name=bucket_name, - install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, - number_of_nodes=max(number_of_nodes), - ) - cluster = clusters_factory(cluster_config) - logging.info("Cluster Created") - remote_command_executor = RemoteCommandExecutor(cluster) - scheduler_commands = scheduler_commands_factory(remote_command_executor) - init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands) - - if not starccm_installed(remote_command_executor): - logging.info("Installing 
StarCCM+") - remote_command_executor.run_remote_script( - str(test_datadir / "starccm.install.sh"), timeout=STARCCM_INSTALLATION_TIMEOUT, hide=False - ) - logging.info("StarCCM+ Installed") - podkey, licpath = get_starccm_secrets(region) - performance_degradation = {} - for node in number_of_nodes: - num_of_tasks = node * TASK_VCPUS - result = remote_command_executor.run_remote_command( - f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"', - additional_files=[str(test_datadir / "starccm.slurm.sh")], - ) - logging.info(f"Submitting StarCCM+ job with {node} nodes") - job_id = scheduler_commands.assert_job_submitted(result.stdout) - scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT) - scheduler_commands.assert_job_succeeded(job_id) - perf_test_result = remote_command_executor.run_remote_script( - (str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False - ) - observed_value = float(perf_test_result.stdout) - logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds") - baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node] - percentage_difference = perf_test_difference(observed_value, baseline_value) - if percentage_difference < 0: - outcome = "improvement" - elif percentage_difference <= PERF_TEST_DIFFERENCE_TOLERANCE: - outcome = "degradation (within tolerance)" - else: - outcome = "degradation (above tolerance)" - performance_degradation[node] = { - "baseline": baseline_value, - "observed": observed_value, - "percentage_difference": percentage_difference, - } - logging.info( - f"Nodes: {node}, Baseline: {baseline_value} seconds, Observed: {observed_value} seconds, " - f"Percentage difference: {percentage_difference}%, Outcome: {outcome}" - ) - - assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands) - - if performance_degradation: - pytest.fail(f"Performance degradation detected: {performance_degradation}") - else: - logging.info("Performance test results show no performance degradation") diff --git a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py new file mode 100644 index 0000000000..bc0860082c --- /dev/null +++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam.py @@ -0,0 +1,231 @@ +import json +import logging +from concurrent.futures import ThreadPoolExecutor + +import boto3 +import pytest +from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor + +from tests.common.utils import assert_no_file_handler_leak, get_compute_ip_to_num_files +from tests.performance_tests.common import _log_output_performance_difference + +# timeout in seconds
STARCCM_INSTALLATION_TIMEOUT = 1800 +STARCCM_JOB_TIMEOUT = 600 +STARCCM_LICENCE_SECRET = "starccm-license-secret" + +OPENFOAM_INSTALLATION_TIMEOUT = 300 +OPENFOAM_JOB_TIMEOUT = 5400 # Takes a long time because the first run not only executes the job but also builds and installs many things + +TASK_VCPUS = 36 # vCPUs are cut in half because multithreading is disabled +BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_STARCCM = { + "alinux2023": {8: 62.414, 16: 31.998, 32: 20.422}, # v3.10.0 + "alinux2": {8: 64.475, 16: 33.173, 32: 17.899}, # v3.1.3 + "ubuntu2204": {8: 75.502, 16: 36.353, 32: 19.688}, # v3.7.0 + "ubuntu2004": {8: 67.384, 16: 36.434, 32: 19.449}, # v3.1.3 + "centos7": {8: 67.838, 16: 36.568, 32: 20.935}, # v3.1.3 + "rhel8": {8: 66.494, 16: 36.154, 32: 20.347}, # v3.6.0 + "rocky8": {8:
66.859, 16: 36.184, 32: 21.090}, # v3.8.0 +} + +BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_OPENFOAM = { + "alinux2": {8: 754, 16: 366, 32: 182}, # v3.1.3 + "ubuntu2204": {8: 742, 16: 376, 32: 185}, # v3.7.0 just a placeholder, Ubuntu22.04 not supported + "ubuntu2004": {8: 750, 16: 382, 32: 187}, # v3.1.3 + "centos7": {8: 755, 16: 371, 32: 190}, # v3.1.3 + "rhel8": {8: 742, 16: 376, 32: 185}, # v3.6.0 just a placeholder, RHEL8 not supported + "rocky8": {8: 742, 16: 376, 32: 185}, # v3.8.0 just a placeholder, Rocky8 not supported +} + +OSS_REQUIRING_EXTRA_DEPS = ["alinux2023", "rhel8", "rocky8"] + + +def get_starccm_secrets(region_name): + secrets_manager_client = boto3.client("secretsmanager", region_name=region_name) + response = secrets_manager_client.get_secret_value(SecretId=STARCCM_LICENCE_SECRET)["SecretString"] + secrets = json.loads(response) + return secrets["podkey"], secrets["licpath"] + + +def openfoam_installed(headnode): + cmd = '[ -d "/shared/SubspaceBenchmarks" ]' + try: + headnode.run_remote_command(cmd, log_error=False) + return True + except RemoteCommandExecutionError: + logging.info("OpenFOAM is not installed on the head node.") + return False + + +def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes): + subspace_benchmarks_dir = "/shared/SubspaceBenchmarks" + logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes") + remote_command_executor.run_remote_command( + f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{number_of_nodes}" 2>&1', + timeout=OPENFOAM_JOB_TIMEOUT, + ) + perf_test_result = remote_command_executor.run_remote_script( + (str(test_datadir / "openfoam.results.sh")), hide=False + ) + output = perf_test_result.stdout.strip() + observed_value = int(output.split("\n")[-1].strip()) + logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") + return observed_value + + +def starccm_installed(headnode): + cmd = "/shared/STAR-CCM+/18.02.008/STAR-CCM+18.02.008/star/bin/starccm+ --version" + try: + headnode.run_remote_command(cmd, log_error=False) + return True + except RemoteCommandExecutionError: + logging.info("STAR-CCM+ is not installed on the head node.") + return False + + +def run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, number_of_nodes, podkey, licpath): + num_of_tasks = number_of_nodes * TASK_VCPUS + result = remote_command_executor.run_remote_command( + f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"' + ) + logging.info(f"Submitting StarCCM+ job with {number_of_nodes} nodes") + job_id = scheduler_commands.assert_job_submitted(result.stdout) + scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT) + scheduler_commands.assert_job_succeeded(job_id) + perf_test_result = remote_command_executor.run_remote_script( + (str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False + ) + observed_value = float(perf_test_result.stdout) + logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds") + return observed_value + + +@pytest.mark.parametrize( + "number_of_nodes", + [[8, 16, 32]], +) +def test_starccm( + vpc_stack, + instance, + os, + region, + scheduler, + pcluster_config_reader, + shared_clusters_factory, + number_of_nodes, + test_datadir, + scheduler_commands_factory, + s3_bucket_factory, +): + logging.info("start to create s3") + bucket_name = s3_bucket_factory() + s3 = boto3.client("s3") + s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, 
"scripts/dependencies.install.sh") + + cluster_config = pcluster_config_reader( + bucket_name=bucket_name, + install_extra_deps=os in OSS_REQUIRING_EXTRA_DEPS, + number_of_nodes=max(number_of_nodes), + ) + cluster = shared_clusters_factory.cluster_factory(cluster_config, region, instance, os, scheduler) + logging.info("Cluster Created") + + remote_command_executor = RemoteCommandExecutor(cluster) + scheduler_commands = scheduler_commands_factory(remote_command_executor) + init_num_files = get_compute_ip_to_num_files(remote_command_executor, scheduler_commands) + + if not starccm_installed(remote_command_executor): + logging.info("Installing StarCCM+") + remote_command_executor.run_remote_script( + str(test_datadir / "starccm.install.sh"), timeout=STARCCM_INSTALLATION_TIMEOUT, hide=False + ) + logging.info("StarCCM+ Installed") + podkey, licpath = get_starccm_secrets(region) + performance_degradation = {} + + # Copy additional files in advanced to avoid conflict when running 8 and 16 nodes tests in parallel + remote_command_executor._copy_additional_files([str(test_datadir / "starccm.slurm.sh")]) + # Run 8 and 16 node tests in parallel + with ThreadPoolExecutor(max_workers=2) as executor: + future_8 = executor.submit( + run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 8, podkey, licpath + ) + future_16 = executor.submit( + run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 16, podkey, licpath + ) + observed_value_8 = future_8.result() + observed_value_16 = future_16.result() + + # Run 32 node test + observed_value_32 = run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, 32, podkey, licpath) + + # Check results and log performance degradation + for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): + baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_STARCCM[os][node] + _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) + + assert_no_file_handler_leak(init_num_files, remote_command_executor, scheduler_commands) + + if performance_degradation: + pytest.fail(f"Performance degradation detected: {performance_degradation}") + else: + logging.info("Performance test results show no performance degradation") + + +@pytest.mark.parametrize( + "number_of_nodes", + [[8, 16, 32]], +) +def test_openfoam( + vpc_stack, + instance, + os, + region, + scheduler, + pcluster_config_reader, + shared_clusters_factory, + number_of_nodes, + test_datadir, + s3_bucket_factory, +): + bucket_name = s3_bucket_factory() + s3 = boto3.client("s3") + s3.upload_file(str(test_datadir / "dependencies.install.sh"), bucket_name, "scripts/dependencies.install.sh") + + cluster_config = pcluster_config_reader( + bucket_name=bucket_name, + install_extra_deps=os in number_of_nodes, + number_of_nodes=max(number_of_nodes), + ) + cluster = shared_clusters_factory.cluster_factory(cluster_config, region, instance, os, scheduler) + logging.info("Cluster Created") + remote_command_executor = RemoteCommandExecutor(cluster) + if not openfoam_installed(remote_command_executor): + logging.info("Installing OpenFOAM") + remote_command_executor.run_remote_script( + str(test_datadir / "openfoam.install.sh"), timeout=OPENFOAM_INSTALLATION_TIMEOUT, hide=False + ) + logging.info("OpenFOAM Installed") + performance_degradation = {} + + # Copy additional files in advanced to avoid conflict when running 8 and 16 nodes tests in parallel + 
remote_command_executor._copy_additional_files([str(test_datadir / "openfoam.slurm.sh")]) + # Run 8 and 16 node tests in parallel + with ThreadPoolExecutor(max_workers=2) as executor: + future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8) + future_16 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 16) + observed_value_8 = future_8.result() + observed_value_16 = future_16.result() + + # Run 32 node test + observed_value_32 = run_openfoam_test(remote_command_executor, test_datadir, 32) + + # Check results and log performance degradation + for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): + baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS_OPENFOAM[os][node] + _log_output_performance_difference(node, performance_degradation, observed_value, baseline_value) + + if performance_degradation: + pytest.fail(f"Performance degradation detected: {performance_degradation}") + else: + logging.info("Performance test results show no performance degradation") diff --git a/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/dependencies.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/dependencies.install.sh new file mode 100644 index 0000000000..e109f8583d --- /dev/null +++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/dependencies.install.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# This script installs the necessary software stack for StarCCM+. +# Note: The same cluster is shared by both test_openfoam and test_starccm. +# The cluster will be created by whichever test (test_openfoam or test_starccm) is executed first. +# If test_openfoam is executed first, it will also need to install the required dependencies. 
+set -ex + +sudo yum install -y libnsl diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.results.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.results.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.results.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.results.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.slurm.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.slurm.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/openfoam.slurm.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/openfoam.slurm.sh diff --git a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/pcluster.config.yaml similarity index 66% rename from tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/pcluster.config.yaml index bf0ea9a2e5..0fce058376 100644 --- a/tests/integration-tests/tests/performance_tests/test_openfoam/test_openfoam/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_openfoam/pcluster.config.yaml @@ -16,12 +16,23 @@ HeadNode: - BucketName: performance-tests-resources-for-parallelcluster KeyName: openfoam/* EnableWriteAccess: false + - BucketName: performance-tests-resources-for-parallelcluster + KeyName: starccm/* + EnableWriteAccess: false +{% if install_extra_deps %} + - BucketName: {{ bucket_name }} + KeyName: scripts/dependencies.install.sh + EnableWriteAccess: false + CustomActions: + OnNodeConfigured: + Script: s3://{{ bucket_name }}/scripts/dependencies.install.sh +{% endif %} Scheduling: Scheduler: slurm SlurmQueues: - Name: q1 ComputeResources: - - Name: c5n18xl-efa + - Name: c5n-18xl-efa InstanceType: {{ instance }} MinCount: {{ number_of_nodes }} MaxCount: {{ number_of_nodes }} @@ -37,6 +48,15 @@ Scheduling: Iam: AdditionalIamPolicies: - Policy: arn:{{partition}}:iam::aws:policy/AmazonSSMManagedInstanceCore # Required to report patching status +{% if install_extra_deps %} + S3Access: + - BucketName: {{ bucket_name }} + KeyName: scripts/dependencies.install.sh + EnableWriteAccess: false + CustomActions: + OnNodeConfigured: + Script: s3://{{ bucket_name }}/scripts/dependencies.install.sh +{% endif %} SharedStorage: - MountDir: /shared Name: shared-fsx diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/dependencies.install.sh 
b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/dependencies.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/dependencies.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/dependencies.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/pcluster.config.yaml similarity index 89% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/pcluster.config.yaml index 3c456e7a13..6a317a9767 100644 --- a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/pcluster.config.yaml +++ b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/pcluster.config.yaml @@ -1,6 +1,8 @@ Region: {{ region }} Image: Os: {{ os }} +Imds: + ImdsSupport: v2.0 HeadNode: InstanceType: {{ instance }} Networking: @@ -11,6 +13,9 @@ HeadNode: AdditionalIamPolicies: - Policy: arn:{{partition}}:iam::aws:policy/AmazonSSMManagedInstanceCore #Required to report patching status S3Access: + - BucketName: performance-tests-resources-for-parallelcluster + KeyName: openfoam/* + EnableWriteAccess: false - BucketName: performance-tests-resources-for-parallelcluster KeyName: starccm/* EnableWriteAccess: false @@ -59,5 +64,7 @@ SharedStorage: FsxLustreSettings: StorageCapacity: 2400 DeploymentType: PERSISTENT_1 + AutomaticBackupRetentionDays: 30 + DailyAutomaticBackupStartTime: 00:00 PerUnitStorageThroughput: 100 StorageType: SSD diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.install.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.install.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.install.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.install.sh diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.results.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.results.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.results.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.results.sh diff --git a/tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.slurm.sh b/tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.slurm.sh similarity index 100% rename from tests/integration-tests/tests/performance_tests/test_starccm/test_starccm/starccm.slurm.sh rename to tests/integration-tests/tests/performance_tests/test_starccm_and_openfoam/test_starccm/starccm.slurm.sh