From 81c9d4e88cfd745275d3555471442fd2cc1a51c3 Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Tue, 15 Jul 2025 10:27:28 -0400 Subject: [PATCH] [Logs] Move the CW Agent configuration authored by ParallelCluster to `/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json`. This is to prevent a race condition in the way we start the CW agent, which may lead to undesired deletion of the config file and, eventually, to node bootstrap failure caused by the CW agent failing to start. --- CHANGELOG.md | 3 +++ .../files/cloudwatch/write_cloudwatch_agent_json.py | 3 ++- .../resources/cloudwatch/partial/_cloudwatch_common.rb | 4 +++- .../spec/unit/resources/cloudwatch_spec.rb | 4 ++-- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d85df10d5..affa31d6aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Upgrade DCV to version 2024.0-19030. - Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management. +**BUG FIXES** +- Fix a race condition in CloudWatch Agent startup that could cause node bootstrap failures. 
+ 3.13.2 ------ diff --git a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py index 3e2951ae52..653d40b3ca 100644 --- a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py +++ b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py @@ -13,7 +13,7 @@ from cloudwatch_agent_common_utils import render_jinja_template -AWS_CLOUDWATCH_CFG_PATH = "/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json" +AWS_CLOUDWATCH_CFG_PATH = "/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json" DEFAULT_METRICS_COLLECTION_INTERVAL = 60 @@ -45,6 +45,7 @@ def gethostname(): def write_config(config): """Write config to AWS_CLOUDWATCH_CFG_PATH.""" + os.makedirs(os.path.dirname(AWS_CLOUDWATCH_CFG_PATH), exist_ok=True) with open(AWS_CLOUDWATCH_CFG_PATH, "w+", encoding="utf-8") as output_config_file: json.dump(config, output_config_file, indent=4) diff --git a/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb b/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb index 9118e6e978..5e9ae86f18 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb @@ -163,6 +163,8 @@ def package_path command "#{cookbook_virtualenv_path}/bin/python #{validator_script_path}" end unless redhat_on_docker? 
+ CW_AGENT_CONFIG_JSON = '/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json' + execute "cloudwatch-config-creation" do user 'root' timeout 300 @@ -182,6 +184,6 @@ def package_path execute "cloudwatch-agent-start" do user 'root' timeout 300 - command "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s" + command "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:#{CW_AGENT_CONFIG_JSON} -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:#{CW_AGENT_CONFIG_JSON} -s" end unless node['cluster']['cw_logging_enabled'] != 'true' || on_docker? end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cloudwatch_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cloudwatch_spec.rb index 9c999d242b..efd3237831 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cloudwatch_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cloudwatch_spec.rb @@ -255,7 +255,7 @@ def self.configure(chef_run) is_expected.to run_execute("cloudwatch-agent-start").with( user: 'root', timeout: 300, - command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s" + command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c 
file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s" ) end end @@ -313,7 +313,7 @@ def self.configure(chef_run) is_expected.to run_execute("cloudwatch-agent-start").with( user: 'root', timeout: 300, - command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s" + command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s" ) end end