diff --git a/CHANGELOG.md b/CHANGELOG.md index 7d85df10d..affa31d6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ This file is used to list changes made in each version of the AWS ParallelCluste - Upgrade DCV to version 2024.0-19030. - Remove `berkshelf`. All cookbooks are local and do not need `berkshelf` dependency management. +**BUG FIXES** +- Fix a race condition in CloudWatch Agent startup that could cause node bootstrap failures. + 3.13.2 ------ diff --git a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py index 3e2951ae5..653d40b3c 100644 --- a/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py +++ b/cookbooks/aws-parallelcluster-environment/files/cloudwatch/write_cloudwatch_agent_json.py @@ -13,7 +13,7 @@ from cloudwatch_agent_common_utils import render_jinja_template -AWS_CLOUDWATCH_CFG_PATH = "/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json" +AWS_CLOUDWATCH_CFG_PATH = "/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json" DEFAULT_METRICS_COLLECTION_INTERVAL = 60 @@ -45,6 +45,7 @@ def gethostname(): def write_config(config): """Write config to AWS_CLOUDWATCH_CFG_PATH.""" + os.makedirs(os.path.dirname(AWS_CLOUDWATCH_CFG_PATH), exist_ok=True) with open(AWS_CLOUDWATCH_CFG_PATH, "w+", encoding="utf-8") as output_config_file: json.dump(config, output_config_file, indent=4) diff --git a/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb b/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb index 9118e6e97..5e9ae86f1 100644 --- a/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb +++ b/cookbooks/aws-parallelcluster-environment/resources/cloudwatch/partial/_cloudwatch_common.rb @@ -163,6 +163,8 @@ def package_path 
command "#{cookbook_virtualenv_path}/bin/python #{validator_script_path}" end unless redhat_on_docker? + CW_AGENT_CONFIG_JSON = '/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json' + execute "cloudwatch-config-creation" do user 'root' timeout 300 @@ -182,6 +184,6 @@ def package_path execute "cloudwatch-agent-start" do user 'root' timeout 300 - command "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s" + command "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:#{CW_AGENT_CONFIG_JSON} -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:#{CW_AGENT_CONFIG_JSON} -s" end unless node['cluster']['cw_logging_enabled'] != 'true' || on_docker? 
end diff --git a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cloudwatch_spec.rb b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cloudwatch_spec.rb index 9c999d242..efd323783 100644 --- a/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cloudwatch_spec.rb +++ b/cookbooks/aws-parallelcluster-environment/spec/unit/resources/cloudwatch_spec.rb @@ -255,7 +255,7 @@ def self.configure(chef_run) is_expected.to run_execute("cloudwatch-agent-start").with( user: 'root', timeout: 300, - command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s" + command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s" ) end end @@ -313,7 +313,7 @@ def self.configure(chef_run) is_expected.to run_execute("cloudwatch-agent-start").with( user: 'root', timeout: 300, - command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config -m ec2 -c file:/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json -s" + command: "/opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a append-config -m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s || /opt/aws/amazon-cloudwatch-agent/bin/amazon-cloudwatch-agent-ctl -a fetch-config 
-m ec2 -c file:/etc/parallelcluster/amazon-cloudwatch-agent/amazon-cloudwatch-agent.json -s" ) end end