Skip to content

Commit e94e9c2

Browse files
Merge pull request #338 from aws/develop
Merge Release 2.4.0
2 parents 7568a23 + 4afb7d6 commit e94e9c2

38 files changed

+1251
-199
lines changed

.kitchen.cloud.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
driver_config:
33
retryable_sleep: 15
44
retryable_tries: 20
5+
retry_limit: 6
56
aws_ssh_key_id: <%= ENV['AWS_KEYPAIR_NAME'] %>
67
region: <%= ENV['AWS_DEFAULT_REGION'] %>
78
instance_type: <%= ENV['AWS_FLAVOR_ID'] %>
@@ -16,6 +17,8 @@ driver_config:
1617
provisioner:
1718
name: chef_zero
1819
require_chef_omnibus: 14.2.0
20+
# use custom chef install URL to cope with issue https://github.com/chef/bento/issues/609
21+
chef_omnibus_url: https://raw.githubusercontent.com/aws/aws-parallelcluster-cookbook/develop/util/chef-install.sh
1922
retry_on_exit_code:
2023
- 35 # 35 is the exit code signaling that the node is rebooting
2124
max_retries: 1

.kitchen.yml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,11 +68,14 @@ suites:
6868
cfn_ephemeral_dir: <%= ENV['CFN_EPHEMERAL_DIR'] %>
6969
cfn_shared_dir: <%= ENV['CFN_SHARED_DIR'] %>
7070
cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %>
71+
custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %>
72+
os: <%= ENV['OS'] %>
7173

7274
- name: sge_config_MasterServer
7375
run_list:
7476
- recipe[aws-parallelcluster::_prep_env]
7577
- recipe[aws-parallelcluster::sge_config]
78+
- recipe[aws-parallelcluster::finalize]
7679
- recipe[aws-parallelcluster::tests]
7780
attributes:
7881
cfncluster:
@@ -87,11 +90,14 @@ suites:
8790
cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %>
8891
cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %>
8992
cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %>
93+
custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %>
94+
os: <%= ENV['OS'] %>
9095

9196
- name: torque_config_MasterServer
9297
run_list:
9398
- recipe[aws-parallelcluster::_prep_env]
9499
- recipe[aws-parallelcluster::torque_config]
100+
- recipe[aws-parallelcluster::finalize]
95101
- recipe[aws-parallelcluster::tests]
96102
attributes:
97103
cfncluster:
@@ -106,11 +112,14 @@ suites:
106112
cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %>
107113
cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %>
108114
cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %>
115+
custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %>
116+
os: <%= ENV['OS'] %>
109117

110118
- name: slurm_config_MasterServer
111119
run_list:
112120
- recipe[aws-parallelcluster::_prep_env]
113121
- recipe[aws-parallelcluster::slurm_config]
122+
- recipe[aws-parallelcluster::finalize]
114123
- recipe[aws-parallelcluster::tests]
115124
attributes:
116125
cfncluster:
@@ -125,11 +134,14 @@ suites:
125134
cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %>
126135
cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %>
127136
cfn_ddb_table: <%= ENV['CFN_DDB_TABLE'] %>
137+
custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %>
138+
os: <%= ENV['OS'] %>
128139

129140
- name: sge_config_ComputeFleet
130141
run_list:
131142
- recipe[aws-parallelcluster::_prep_env]
132143
- recipe[aws-parallelcluster::sge_config]
144+
- recipe[aws-parallelcluster::finalize]
133145
- recipe[aws-parallelcluster::tests]
134146
attributes:
135147
cfncluster:
@@ -144,11 +156,14 @@ suites:
144156
cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %>
145157
cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %>
146158
cfn_master: <%= ENV['CFN_MASTER'] %>
159+
custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %>
160+
os: <%= ENV['OS'] %>
147161

148162
- name: torque_config_ComputeFleet
149163
run_list:
150164
- recipe[aws-parallelcluster::_prep_env]
151165
- recipe[aws-parallelcluster::torque_config]
166+
- recipe[aws-parallelcluster::finalize]
152167
- recipe[aws-parallelcluster::tests]
153168
attributes:
154169
cfncluster:
@@ -163,11 +178,14 @@ suites:
163178
cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %>
164179
cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %>
165180
cfn_master: <%= ENV['CFN_MASTER'] %>
181+
custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %>
182+
os: <%= ENV['OS'] %>
166183

167184
- name: slurm_config_ComputeFleet
168185
run_list:
169186
- recipe[aws-parallelcluster::_prep_env]
170187
- recipe[aws-parallelcluster::slurm_config]
188+
- recipe[aws-parallelcluster::finalize]
171189
- recipe[aws-parallelcluster::tests]
172190
attributes:
173191
cfncluster:
@@ -182,3 +200,5 @@ suites:
182200
cfn_cluster_user: <%= ENV['CFN_CLUSTER_USER'] %>
183201
cfn_sqs_queue: <%= ENV['CFN_SQS_QUEUE'] %>
184202
cfn_master: <%= ENV['CFN_MASTER'] %>
203+
custom_node_package: <%= ENV['PARALLELCLUSTER_NODE_URL'] %>
204+
os: <%= ENV['OS'] %>

CHANGELOG.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,36 @@ aws-parallelcluster-cookbook CHANGELOG
33

44
This file is used to list changes made in each version of the AWS ParallelCluster cookbook.
55

6+
2.4.0
7+
-----
8+
9+
**ENHANCEMENTS**
10+
- Add support for EFA on CentOS 7, Amazon Linux and Ubuntu 16.04
11+
- Add support for Ubuntu in China region `cn-northwest-1`
12+
13+
**CHANGES**
14+
- SGE: change the following parameters in the global configuration
15+
- `max_unheard 00:03:00`: allows a faster reaction in case of faulty nodes
16+
- `reschedule_unknown 00:00:30`: enables rescheduling of jobs running on failing nodes
17+
- `qmaster_params ENABLE_FORCED_QDEL_IF_UNKNOWN`: forces job deletion on unresponsive nodes
18+
- `qmaster_params ENABLE_RESCHEDULE_KILL`: forces rescheduling or killing of jobs running on failing nodes
19+
- Slurm: decrease SlurmdTimeout to 120 seconds to speed up replacement of faulty nodes
20+
- Always use full master FQDN when mounting NFS on compute nodes. This solves some issues occurring with some networking
21+
setups and custom DNS configurations
22+
- Set soft and hard ulimit on open files to 10000 for all supported OSs
23+
- Pin python `supervisor` version to 3.4.0
24+
- Remove unused `compute_instance_type` from jobwatcher.cfg
25+
- Remove unused `max_queue_size` from sqswatcher.cfg
26+
- Remove double quoting of the post_install args
27+
28+
**BUG FIXES**
29+
- Fix issue that was preventing Torque from being used on CentOS 7
30+
- Start node daemons at the end of instance initialization. The time spent on the post-install script and node
31+
initialization is no longer counted as part of node idle time.
32+
- Fix issue which was causing an additional and invalid EBS mount point to be added in case of multiple EBS volumes
33+
- Install Slurm libpmpi/libpmpi2, which has been distributed in a separate package since Slurm 17
34+
35+
636
2.3.1
737
-----
838

amis/packer_alinux.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@
210210
"pause_before": "2m",
211211
"json" : {
212212
"cfncluster" : {
213+
"cfn_region": "{{user `region`}}",
213214
"nvidia" : {
214215
"enabled" : "{{user `nvidia_enabled`}}"
215216
},
@@ -246,7 +247,6 @@
246247
},
247248
{
248249
"type" : "shell",
249-
"only": ["custom-alinux"],
250250
"inline" : [
251251
"sudo /usr/local/sbin/ami_cleanup.sh"
252252
]

amis/packer_centos6.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@
219219
"pause_before": "2m",
220220
"json" : {
221221
"cfncluster" : {
222+
"cfn_region": "{{user `region`}}",
222223
"nvidia" : {
223224
"enabled" : "{{user `nvidia_enabled`}}"
224225
},
@@ -251,7 +252,7 @@
251252
"inline" : [
252253
"region=\"{{user `region`}}\"",
253254
"bucket=\"s3.amazonaws.com\"",
254-
"[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"",
255+
"[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"",
255256
"curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz",
256257
"sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz"
257258
]
@@ -265,7 +266,6 @@
265266
},
266267
{
267268
"type" : "shell",
268-
"only": ["custom-centos6"],
269269
"inline" : [
270270
"sudo /usr/local/sbin/ami_cleanup.sh"
271271
]

amis/packer_centos7.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@
224224
"pause_before": "2m",
225225
"json" : {
226226
"cfncluster" : {
227+
"cfn_region": "{{user `region`}}",
227228
"nvidia" : {
228229
"enabled" : "{{user `nvidia_enabled`}}"
229230
},
@@ -256,7 +257,7 @@
256257
"inline" : [
257258
"region=\"{{user `region`}}\"",
258259
"bucket=\"s3.amazonaws.com\"",
259-
"[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"",
260+
"[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"",
260261
"curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz",
261262
"which pip2",
262263
"if [ $? -eq 0 ]; then sudo pip2 install /tmp/aws-cfn-bootstrap-latest.tar.gz; else sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz; fi"
@@ -277,7 +278,6 @@
277278
},
278279
{
279280
"type" : "shell",
280-
"only": ["custom-centos7"],
281281
"inline" : [
282282
"sudo /usr/local/sbin/ami_cleanup.sh"
283283
]

amis/packer_ubuntu1404.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,7 @@
224224
"pause_before": "2m",
225225
"json" : {
226226
"cfncluster" : {
227+
"cfn_region": "{{user `region`}}",
227228
"nvidia" : {
228229
"enabled" : "{{user `nvidia_enabled`}}"
229230
},
@@ -257,7 +258,7 @@
257258
"inline" : [
258259
"region=\"{{user `region`}}\"",
259260
"bucket=\"s3.amazonaws.com\"",
260-
"[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"",
261+
"[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"",
261262
"curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz",
262263
"sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz"
263264
]
@@ -271,7 +272,6 @@
271272
},
272273
{
273274
"type" : "shell",
274-
"only": ["custom-ubuntu1404"],
275275
"inline" : [
276276
"sudo /usr/local/sbin/ami_cleanup.sh"
277277
]

amis/packer_ubuntu1604.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@
227227
"pause_before": "2m",
228228
"json" : {
229229
"cfncluster" : {
230+
"cfn_region": "{{user `region`}}",
230231
"nvidia" : {
231232
"enabled" : "{{user `nvidia_enabled`}}"
232233
},
@@ -260,7 +261,7 @@
260261
"inline" : [
261262
"region=\"{{user `region`}}\"",
262263
"bucket=\"s3.amazonaws.com\"",
263-
"[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn\"",
264+
"[[ ${region} =~ ^cn- ]] && bucket=\"s3.cn-north-1.amazonaws.com.cn/cn-north-1-aws-parallelcluster\"",
264265
"curl --retry 3 -L -o /tmp/aws-cfn-bootstrap-latest.tar.gz https://${bucket}/cloudformation-examples/aws-cfn-bootstrap-latest.tar.gz",
265266
"sudo pip install /tmp/aws-cfn-bootstrap-latest.tar.gz"
266267
]
@@ -274,7 +275,6 @@
274275
},
275276
{
276277
"type" : "shell",
277-
"only": ["custom-ubuntu1604"],
278278
"inline" : [
279279
"sudo /usr/local/sbin/ami_cleanup.sh"
280280
]

amis/packer_variables.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
2-
"parallelcluster_version": "2.3.1",
3-
"parallelcluster_cookbook_version": "2.3.1",
2+
"parallelcluster_version": "2.4.0",
3+
"parallelcluster_cookbook_version": "2.4.0",
44
"chef_version": "14.2.0",
55
"ridley_version": "5.1.1",
66
"berkshelf_version": "7.0.4"

attributes/default.rb

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
default['cfncluster']['scripts_dir'] = "#{node['cfncluster']['base_dir']}/scripts"
2020
default['cfncluster']['license_dir'] = "#{node['cfncluster']['base_dir']}/licenses"
2121
# Python packages
22-
default['cfncluster']['cfncluster-version'] = '2.3.1'
23-
default['cfncluster']['cfncluster-node-version'] = '2.3.1'
24-
default['cfncluster']['cfncluster-supervisor-version'] = '3.3.1'
22+
default['cfncluster']['cfncluster-version'] = '2.4.0'
23+
default['cfncluster']['cfncluster-node-version'] = '2.4.0'
24+
default['cfncluster']['supervisor-version'] = '3.4.0'
2525
# URLs to software packages used during install recipes
2626
# Gridengine software
2727
default['cfncluster']['sge']['version'] = '8.1.9'
@@ -45,6 +45,8 @@
4545
default['cfncluster']['nvidia']['enabled'] = 'no'
4646
default['cfncluster']['nvidia']['driver_url'] = 'http://download.nvidia.com/XFree86/Linux-x86_64/418.56/NVIDIA-Linux-x86_64-418.56.run'
4747
default['cfncluster']['nvidia']['cuda_url'] = 'https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux'
48+
# EFA
49+
default['cfncluster']['efa']['installer_url'] = 'https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-latest.tar.gz'
4850

4951
# Reboot after default_pre recipe
5052
default['cfncluster']['default_pre_reboot'] = 'true'
@@ -83,7 +85,7 @@
8385
if node['platform_version'].to_i >= 7
8486
default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel
8587
libXmu-devel hwloc-devel libdb-devel tcl-devel automake autoconf pyparted libtool
86-
httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel openmpi-devel R atlas-devel
88+
httpd boost-devel redhat-lsb mlocate lvm2 mpich-devel R atlas-devel
8789
blas-devel fftw-devel libffi-devel openssl-devel dkms mariadb-devel libedit-devel
8890
libical-devel postgresql-devel postgresql-server sendmail libxml2-devel libglvnd-devel mdadm]
8991
if node['platform_version'].split('.')[1] == '6'
@@ -105,7 +107,7 @@
105107
when 'amazon'
106108
default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh openssl-devel ncurses-devel pam-devel net-tools openmotif-devel
107109
libXmu-devel hwloc-devel db4-devel tcl-devel automake autoconf pyparted libtool
108-
httpd boost-devel redhat-lsb mlocate mpich-devel openmpi-devel R atlas-devel fftw-devel
110+
httpd boost-devel redhat-lsb mlocate mpich-devel R atlas-devel fftw-devel
109111
libffi-devel openssl-devel dkms mysql-devel libedit-devel postgresql-devel postgresql-server
110112
sendmail cmake byacc libglvnd-devel mdadm]
111113
end
@@ -123,8 +125,11 @@
123125
default['cfncluster']['base_packages'] = %w[vim ksh tcsh zsh libssl-dev ncurses-dev libpam-dev net-tools libhwloc-dev dkms
124126
tcl-dev automake autoconf python-parted libtool librrd-dev libapr1-dev libconfuse-dev
125127
apache2 libboost-dev libdb-dev tcsh libssl-dev libncurses5-dev libpam0g-dev libxt-dev
126-
libmotif-dev libxmu-dev libxft-dev libhwloc-dev man-db lvm2 libmpich-dev libopenmpi-dev
128+
libmotif-dev libxmu-dev libxft-dev libhwloc-dev man-db lvm2 libmpich-dev
127129
r-base libatlas-dev libblas-dev libfftw3-dev libffi-dev libssl-dev libxml2-dev mdadm]
130+
if node['platform_version'] == '14.04'
131+
default['cfncluster']['base_packages'].push('libopenmpi-dev')
132+
end
128133
default['cfncluster']['kernel_generic_pkg'] = "linux-generic"
129134
default['cfncluster']['kernel_extra_pkg'] = "linux-image-extra-#{node['kernel']['release']}"
130135
default['cfncluster']['ganglia']['apache_user'] = 'www-data'
@@ -166,7 +171,6 @@
166171
default['cfncluster']['cfn_shared_dir'] = '/shared'
167172
default['cfncluster']['cfn_efs_shared_dir'] = 'NONE'
168173
default['cfncluster']['cfn_efs'] = nil
169-
default['cfncluster']['cfn_node_type'] = nil
170174
default['cfncluster']['cfn_master'] = nil
171175
default['cfncluster']['cfn_cluster_user'] = 'ec2-user'
172176
default['cfncluster']['cfn_fsx_options'] = 'NONE'

0 commit comments

Comments
 (0)