From 4b5a6b3687bf0a8e1e058a9a84e5d6dcdc37d943 Mon Sep 17 00:00:00 2001 From: Rohin Bhasin Date: Wed, 11 Sep 2024 17:13:28 -0400 Subject: [PATCH] Switch `multinode_cpu_cluster` to `multinode_cpu_docker_conda_cluster`. (#1253) --- tests/conftest.py | 6 +- tests/fixtures/on_demand_cluster_fixtures.py | 8 ++- tests/test_obj_store.py | 2 +- .../test_clusters/test_cluster.py | 2 +- .../test_clusters/test_multinode_cluster.py | 57 +++++++++++-------- .../test_clusters/test_on_demand_cluster.py | 2 +- tests/test_resources/test_envs/test_env.py | 2 +- .../test_secrets/test_secret.py | 2 +- 8 files changed, 46 insertions(+), 35 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 106fd4186..5505db930 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -52,7 +52,7 @@ class TestCluster(tests.test_resources.test_resource.TestResource): "docker_cluster_pwd_ssh_no_auth", "ondemand_aws_docker_cluster", "static_cpu_pwd_cluster", - "multinode_cpu_cluster" + "multinode_cpu_docker_conda_cluster" ] } @@ -236,7 +236,7 @@ def event_loop(): from tests.fixtures.on_demand_cluster_fixtures import ( a10g_gpu_cluster, # noqa: F401 k80_gpu_cluster, # noqa: F401 - multinode_cpu_cluster, # noqa: F401 + multinode_cpu_docker_conda_cluster, # noqa: F401 multinode_gpu_cluster, # noqa: F401 ondemand_aws_docker_cluster, # noqa: F401 ondemand_aws_https_cluster_with_auth, # noqa: F401 @@ -375,7 +375,7 @@ def event_loop(): "ondemand_k8s_cluster", "ondemand_k8s_docker_cluster", "ondemand_aws_https_cluster_with_auth", - "multinode_cpu_cluster", + "multinode_cpu_docker_conda_cluster", "static_cpu_pwd_cluster", "multinode_gpu_cluster", # for testing cluster status on multinode gpu. ] diff --git a/tests/fixtures/on_demand_cluster_fixtures.py b/tests/fixtures/on_demand_cluster_fixtures.py index cb7057969..cf4aed1de 100644 --- a/tests/fixtures/on_demand_cluster_fixtures.py +++ b/tests/fixtures/on_demand_cluster_fixtures.py @@ -169,12 +169,16 @@ def a10g_gpu_cluster(request): @pytest.fixture(scope="session") -def multinode_cpu_cluster(request): +def multinode_cpu_docker_conda_cluster(request): args = { "name": "rh-cpu-multinode", "num_instances": NUM_OF_INSTANCES, "image_id": "docker:rayproject/ray:latest-py311-cpu", - "default_env": rh.env(reqs=["ray==2.30.0"], working_dir=None), + "default_env": rh.conda_env( + name="default_env", + reqs=test_env().reqs + ["ray==2.30.0"], + conda_env={"dependencies": ["python=3.11"], "name": "default_env"}, + ), "provider": "aws", "instance_type": "CPU:2+", } diff --git a/tests/test_obj_store.py b/tests/test_obj_store.py index ffc41c08f..b5fcf49b4 100644 --- a/tests/test_obj_store.py +++ b/tests/test_obj_store.py @@ -43,7 +43,7 @@ "ondemand_k8s_cluster", "ondemand_k8s_docker_cluster", "ondemand_aws_https_cluster_with_auth", - "multinode_cpu_cluster", + "multinode_cpu_docker_conda_cluster", "static_cpu_pwd_cluster", ] } diff --git a/tests/test_resources/test_clusters/test_cluster.py b/tests/test_resources/test_clusters/test_cluster.py index 81d024d12..cee1ecec9 100644 --- a/tests/test_resources/test_clusters/test_cluster.py +++ b/tests/test_resources/test_clusters/test_cluster.py @@ -124,7 +124,7 @@ class TestCluster(tests.test_resources.test_resource.TestResource): "docker_cluster_pk_ssh_den_auth", "docker_cluster_pwd_ssh_no_auth", "static_cpu_pwd_cluster", - "multinode_cpu_cluster", + "multinode_cpu_docker_conda_cluster", ] } diff --git a/tests/test_resources/test_clusters/test_multinode_cluster.py b/tests/test_resources/test_clusters/test_multinode_cluster.py index 4af4c6213..fde6d4309 100644 --- a/tests/test_resources/test_clusters/test_multinode_cluster.py +++ b/tests/test_resources/test_clusters/test_multinode_cluster.py @@ -9,15 +9,15 @@ class TestMultiNodeCluster: @pytest.mark.level("release") - def test_rsync_and_ssh_onto_worker_node(self, multinode_cpu_cluster): - worker_node = multinode_cpu_cluster.ips[-1] + def test_rsync_and_ssh_onto_worker_node(self, multinode_cpu_docker_conda_cluster): + worker_node = multinode_cpu_docker_conda_cluster.ips[-1] local_rh_package_path = Path(importlib.util.find_spec("runhouse").origin).parent local_rh_package_path = local_rh_package_path.parent dest_path = f"~/{local_rh_package_path.name}" # Rsync Runhouse package onto the worker node - multinode_cpu_cluster.rsync( + multinode_cpu_docker_conda_cluster.rsync( source=str(local_rh_package_path), dest=dest_path, up=True, @@ -25,7 +25,7 @@ def test_rsync_and_ssh_onto_worker_node(self, multinode_cpu_cluster): contents=True, ) - status_codes = multinode_cpu_cluster.run( + status_codes = multinode_cpu_docker_conda_cluster.run( [f"ls -l {dest_path}"], node=worker_node ) assert status_codes[0][0] == 0 @@ -34,11 +34,13 @@ def test_rsync_and_ssh_onto_worker_node(self, multinode_cpu_cluster): @pytest.mark.level("release") def test_ray_started_on_worker_node_after_cluster_restart( - self, multinode_cpu_cluster + self, multinode_cpu_docker_conda_cluster ): - head_node = multinode_cpu_cluster.ips[0] + head_node = multinode_cpu_docker_conda_cluster.ips[0] - status_codes = multinode_cpu_cluster.run(["ray status"], node=head_node) + status_codes = multinode_cpu_docker_conda_cluster.run( + ["ray status"], node=head_node + ) assert status_codes[0][0] == 0 status_output = status_codes[0][1] @@ -47,17 +49,19 @@ def test_ray_started_on_worker_node_after_cluster_restart( assert num_nodes == 2 @pytest.mark.level("release") - def test_send_envs_to_specific_worker_node(self, multinode_cpu_cluster): + def test_send_envs_to_specific_worker_node( + self, multinode_cpu_docker_conda_cluster + ): env_0 = rh.env( name="worker_env_0", reqs=["langchain", "pytest"], - ).to(multinode_cpu_cluster, node_idx=0) + ).to(multinode_cpu_docker_conda_cluster, node_idx=0) env_1 = rh.env( name="worker_env_1", reqs=["torch", "pytest"], - ).to(multinode_cpu_cluster, node_idx=1) + ).to(multinode_cpu_docker_conda_cluster, node_idx=1) env_2 = rh.env( name="worker_env_2", @@ -65,54 +69,57 @@ def test_send_envs_to_specific_worker_node(self, multinode_cpu_cluster): ) with pytest.raises(ValueError): - env_2.to(multinode_cpu_cluster, node_idx=len(multinode_cpu_cluster.ips)) + env_2.to( + multinode_cpu_docker_conda_cluster, + node_idx=len(multinode_cpu_docker_conda_cluster.ips), + ) - env_2.to(multinode_cpu_cluster, node_idx=1) + env_2.to(multinode_cpu_docker_conda_cluster, node_idx=1) get_pid_0 = rh.function(get_pid_and_ray_node).to( - name="get_pid_0", system=multinode_cpu_cluster, env=env_0 + name="get_pid_0", system=multinode_cpu_docker_conda_cluster, env=env_0 ) get_pid_1 = rh.function(get_pid_and_ray_node).to( - name="get_pid_1", system=multinode_cpu_cluster, env=env_1 + name="get_pid_1", system=multinode_cpu_docker_conda_cluster, env=env_1 ) get_pid_2 = rh.function(get_pid_and_ray_node).to( - name="get_pid_2", system=multinode_cpu_cluster, env=env_2 + name="get_pid_2", system=multinode_cpu_docker_conda_cluster, env=env_2 ) assert get_pid_0()[1] != get_pid_1()[1] assert get_pid_1()[1] == get_pid_2()[1] @pytest.mark.level("release") - def test_specifying_resources(self, multinode_cpu_cluster): + def test_specifying_resources(self, multinode_cpu_docker_conda_cluster): env0 = rh.env( name="worker_env_0", compute={"CPU": 1.75}, - ).to(multinode_cpu_cluster) + ).to(multinode_cpu_docker_conda_cluster) env1 = rh.env( name="worker_env_1", compute={"CPU": 0.5}, - ).to(multinode_cpu_cluster) + ).to(multinode_cpu_docker_conda_cluster) env2 = rh.env( name="worker_env_2", compute={"memory": 4 * 1024 * 1024 * 1024}, - ).to(multinode_cpu_cluster) + ).to(multinode_cpu_docker_conda_cluster) env3 = rh.env( name="worker_env_3", compute={"CPU": 0.1, "memory": 2 * 1024 * 1024 * 1024}, - ).to(multinode_cpu_cluster) + ).to(multinode_cpu_docker_conda_cluster) - status = multinode_cpu_cluster.status() + status = multinode_cpu_docker_conda_cluster.status() env0_node = status["env_servlet_processes"][env0.name]["node_ip"] env1_node = status["env_servlet_processes"][env1.name]["node_ip"] env2_node = status["env_servlet_processes"][env2.name]["node_ip"] env3_node = status["env_servlet_processes"][env3.name]["node_ip"] - assert env0_node in multinode_cpu_cluster.internal_ips - assert env1_node in multinode_cpu_cluster.internal_ips - assert env2_node in multinode_cpu_cluster.internal_ips - assert env3_node in multinode_cpu_cluster.internal_ips + assert env0_node in multinode_cpu_docker_conda_cluster.internal_ips + assert env1_node in multinode_cpu_docker_conda_cluster.internal_ips + assert env2_node in multinode_cpu_docker_conda_cluster.internal_ips + assert env3_node in multinode_cpu_docker_conda_cluster.internal_ips assert env0_node != env1_node # Too much CPU assert env2_node != env3_node # Too much memory diff --git a/tests/test_resources/test_clusters/test_on_demand_cluster.py b/tests/test_resources/test_clusters/test_on_demand_cluster.py index 450854995..7df71f149 100644 --- a/tests/test_resources/test_clusters/test_on_demand_cluster.py +++ b/tests/test_resources/test_clusters/test_on_demand_cluster.py @@ -89,7 +89,7 @@ class TestOnDemandCluster(tests.test_resources.test_clusters.test_cluster.TestCl "k80_gpu_cluster", "a10g_gpu_cluster", "static_cpu_pwd_cluster", - "multinode_cpu_cluster", + "multinode_cpu_docker_conda_cluster", "multinode_gpu_cluster", ] } diff --git a/tests/test_resources/test_envs/test_env.py b/tests/test_resources/test_envs/test_env.py index e09b967c1..b7b173c85 100644 --- a/tests/test_resources/test_envs/test_env.py +++ b/tests/test_resources/test_envs/test_env.py @@ -95,7 +95,7 @@ class TestEnv(tests.test_resources.test_resource.TestResource): "ondemand_k8s_docker_cluster", "ondemand_aws_https_cluster_with_auth", "static_cpu_pwd_cluster", - "multinode_cpu_cluster", + "multinode_cpu_docker_conda_cluster", "docker_cluster_pk_ssh_no_auth", "docker_cluster_pwd_ssh_no_auth", "docker_cluster_pk_ssh_den_auth", diff --git a/tests/test_resources/test_secrets/test_secret.py b/tests/test_resources/test_secrets/test_secret.py index f671df54f..0cfa546f3 100644 --- a/tests/test_resources/test_secrets/test_secret.py +++ b/tests/test_resources/test_secrets/test_secret.py @@ -96,7 +96,7 @@ class TestSecret(tests.test_resources.test_resource.TestResource): "ondemand_k8s_docker_cluster", "ondemand_aws_https_cluster_with_auth", "static_cpu_pwd_cluster", - "multinode_cpu_cluster", + "multinode_cpu_docker_conda_cluster", "docker_cluster_pk_ssh_no_auth", "docker_cluster_pwd_ssh_no_auth", "docker_cluster_pk_ssh_den_auth",