Skip to content

Commit a5cc72e

Browse files
authored
Apply Slurm patch to fix scontrol update nodelist sorting (#1638)
Add steps to apply patches to Slurm tarball Filename pattern expansion is guaranteed to be sorted alphabetically: https://www.gnu.org/software/bash/manual/bash.html#Filename-Expansion Use nullglob bash option to handle case with no patches in patch folder. Add Slurm patch for scontrol update nodelist sorting bug Signed-off-by: Jacopo De Amicis <jdamicis@amazon.it>
1 parent a4d3edf commit a5cc72e

File tree

5 files changed

+164
-0
lines changed

5 files changed

+164
-0
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,12 @@ aws-parallelcluster-cookbook CHANGELOG
33

44
This file is used to list changes made in each version of the AWS ParallelCluster cookbook.
55

6+
3.4.1
7+
-----
8+
9+
**BUG FIXES**
10+
- Fix an issue with the Slurm scheduler that might incorrectly apply updates to its internal registry of compute nodes. This might result in EC2 instances to become inaccessible or backed by an incorrect instance type.
11+
612
3.4.0
713
------
814

cookbooks/aws-parallelcluster-slurm/files/default/install_slurm/slurm_patches/.gitignore

Whitespace-only changes.
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#diff --git a/NEWS b/NEWS
2+
#index ef51ddaee35..b847ab43ee9 100644
3+
#--- a/NEWS
4+
#+++ b/NEWS
5+
#@@ -9,6 +9,8 @@ documents those changes that are of interest to users and administrators.
6+
# ping_nodes() calls all nodes to re-register.
7+
# -- openapi/dbv0.0.3[6-8] - fix segfault that could arrise from Slurm database
8+
# connnection failing.
9+
#+ -- Fix regression introduced in 22.05.0rc1 when updating a NodeName=<nodelist>
10+
#+ with NodeAddr and/or NodeHostname if the specified nodelist wasn't sorted.
11+
#
12+
# * Changes in Slurm 22.05.7
13+
# ==========================
14+
diff --git a/doc/man/man1/scontrol.1 b/doc/man/man1/scontrol.1
15+
index d20c7a768ec..8913345038d 100644
16+
--- a/doc/man/man1/scontrol.1
17+
+++ b/doc/man/man1/scontrol.1
18+
@@ -1,4 +1,4 @@
19+
-.TH scontrol "1" "Slurm Commands" "November 2022" "Slurm Commands"
20+
+.TH scontrol "1" "Slurm Commands" "January 2023" "Slurm Commands"
21+
22+
.SH "NAME"
23+
scontrol \- view or modify Slurm configuration and state.
24+
@@ -474,7 +474,7 @@ unique (e.g. tux2,tux1,tux2 = tux[2,1\-2]). If you wanted a sorted list use
25+
26+
.TP
27+
\fBhostlistsorted\fR
28+
-Takes a list of host names and prints a sorted, unique hostlist
29+
+Takes a list of host names and prints a sorted (but not unique) hostlist
30+
expression for them. See \fBhostlist\fR.
31+
.IP
32+
33+
diff --git a/src/slurmctld/node_mgr.c b/src/slurmctld/node_mgr.c
34+
index ef4b111a595..a86e529c338 100644
35+
--- a/src/slurmctld/node_mgr.c
36+
+++ b/src/slurmctld/node_mgr.c
37+
@@ -1368,14 +1368,18 @@ int update_node(update_node_msg_t *update_node_msg, uid_t auth_uid)
38+
hostlist_t host_list, hostaddr_list = NULL, hostname_list = NULL;
39+
uint32_t base_state = 0, node_flags, state_val;
40+
time_t now = time(NULL);
41+
+ bool uniq = true;
42+
43+
if (update_node_msg->node_names == NULL ) {
44+
info("%s: invalid node name", __func__);
45+
return ESLURM_INVALID_NODE_NAME;
46+
}
47+
48+
+ if (update_node_msg->node_addr || update_node_msg->node_hostname)
49+
+ uniq = false;
50+
+
51+
if (!(host_list = nodespec_to_hostlist(update_node_msg->node_names,
52+
- NULL)))
53+
+ uniq, NULL)))
54+
return ESLURM_INVALID_NODE_NAME;
55+
56+
if (!(node_cnt = hostlist_count(host_list))) {
57+
@@ -4867,7 +4871,7 @@ extern int delete_nodes(char *names, char **err_msg)
58+
59+
lock_slurmctld(write_lock);
60+
61+
- if (!(to_delete = nodespec_to_hostlist(names, NULL))) {
62+
+ if (!(to_delete = nodespec_to_hostlist(names, true, NULL))) {
63+
ret_rc = ESLURM_INVALID_NODE_NAME;
64+
goto cleanup;
65+
}
66+
diff --git a/src/slurmctld/partition_mgr.c b/src/slurmctld/partition_mgr.c
67+
index d13e71d353f..b00d9f3392a 100644
68+
--- a/src/slurmctld/partition_mgr.c
69+
+++ b/src/slurmctld/partition_mgr.c
70+
@@ -192,7 +192,7 @@ extern int build_part_bitmap(part_record_t *part_ptr)
71+
node_record_count - 1);
72+
}
73+
74+
- if (!(host_list = nodespec_to_hostlist(part_ptr->orig_nodes,
75+
+ if (!(host_list = nodespec_to_hostlist(part_ptr->orig_nodes, true,
76+
&part_ptr->nodesets))) {
77+
/* Error, restore original bitmap */
78+
FREE_NULL_BITMAP(part_ptr->node_bitmap);
79+
diff --git a/src/slurmctld/read_config.c b/src/slurmctld/read_config.c
80+
index f3e3a75deed..fb0f4e9a2f8 100644
81+
--- a/src/slurmctld/read_config.c
82+
+++ b/src/slurmctld/read_config.c
83+
@@ -307,7 +307,9 @@ static void _add_nodes_with_feature(hostlist_t hl, char *feature)
84+
}
85+
}
86+
87+
-extern hostlist_t nodespec_to_hostlist(const char *nodes, char **nodesets)
88+
+extern hostlist_t nodespec_to_hostlist(const char *nodes,
89+
+ bool uniq,
90+
+ char **nodesets)
91+
{
92+
int count;
93+
slurm_conf_nodeset_t *ptr, **ptr_array;
94+
@@ -354,7 +356,8 @@ extern hostlist_t nodespec_to_hostlist(const char *nodes, char **nodesets)
95+
}
96+
}
97+
98+
- hostlist_uniq(hl);
99+
+ if (uniq)
100+
+ hostlist_uniq(hl);
101+
return hl;
102+
}
103+
104+
diff --git a/src/slurmctld/slurmctld.h b/src/slurmctld/slurmctld.h
105+
index d254cdda52e..b31a5c43f99 100644
106+
--- a/src/slurmctld/slurmctld.h
107+
+++ b/src/slurmctld/slurmctld.h
108+
@@ -2941,6 +2941,7 @@ extern int update_node_avail_features(char *node_names, char *avail_features,
109+
* Handles node range expressions, nodesets and ALL keyword.
110+
*
111+
* IN nodes - nodelist that can have nodesets or ALL in it.
112+
+ * IN uniq - call hostlist_uniq() before returning the hostlist
113+
* OUT nodesets (optional) - list of nodesets found in nodes string
114+
*
115+
* RET NULL on error, hostlist_t otherwise.
116+
@@ -2948,7 +2949,9 @@ extern int update_node_avail_features(char *node_names, char *avail_features,
117+
* NOTE: Caller must FREE_NULL_HOSTLIST() returned hostlist_t.
118+
* NOTE: Caller should interpret a non-NULL but empty hostlist conveniently.
119+
*/
120+
-extern hostlist_t nodespec_to_hostlist(const char *nodes, char **nodesets);
121+
+extern hostlist_t nodespec_to_hostlist(const char *nodes,
122+
+ bool uniq,
123+
+ char **nodesets);
124+
125+
/*
126+
* set_node_reboot_reason - appropriately set node reason with reboot message
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Slurm patches configuration
2+
3+
This folder is for patch files for the Slurm source code. Patches must be produced by git
4+
(e.g. via `git diff` or by downloading the diff from a commit in GitHub).
5+
6+
Patch files must be prepended with a zero-padded number in order to allow the cookbook to
7+
apply the patches in a defined order. For example:
8+
- 01_hashabc.diff
9+
- 02_hashdef.diff
10+
- 03 hash123.diff

cookbooks/aws-parallelcluster-slurm/recipes/install_slurm.rb

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,14 @@
7272
end
7373
end
7474

75+
# Copy Slurm patches
76+
remote_directory "#{node['cluster']['sources_dir']}/slurm_patches" do
77+
source 'install_slurm/slurm_patches'
78+
mode '0755'
79+
action :create
80+
recursive true
81+
end
82+
7583
# Install Slurm
7684
bash 'make install' do
7785
user 'root'
@@ -85,11 +93,25 @@
8593
8694
tar xf #{slurm_tarball}
8795
cd slurm-slurm-#{node['cluster']['slurm']['version']}
96+
97+
# Apply possible Slurm patches
98+
shopt -s nullglob # with this an empty slurm_patches directory does not trigger the loop
99+
for patch in #{node['cluster']['sources_dir']}/slurm_patches/*.diff; do
100+
echo "Applying patch ${patch}..."
101+
patch --ignore-whitespace -p1 < ${patch}
102+
echo "...DONE."
103+
done
104+
shopt -u nullglob
105+
106+
# Configure Slurm
88107
./configure --prefix=#{node['cluster']['slurm']['install_dir']} --with-pmix=/opt/pmix --with-jwt=/opt/libjwt --enable-slurmrestd
108+
109+
# Build Slurm
89110
CORES=$(grep processor /proc/cpuinfo | wc -l)
90111
make -j $CORES
91112
make install
92113
make install-contrib
114+
93115
deactivate
94116
SLURM
95117
# TODO: Fix, so it works for upgrade

0 commit comments

Comments
 (0)