Skip to content

Commit 778d32a

Browse files
Himani Anil Deshpande (himani2411)
authored and committed
Add NVSwitch device ID for p6 instance type
1 parent 478eb39 commit 778d32a

File tree

2 files changed

+43
-4
lines changed

2 files changed

+43
-4
lines changed

cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,12 @@ def _nvidia_driver_version
5454

5555
# Get number of nv switches
5656
def get_nvswitches
57-
# A100 (P4) and H100(P5) systems have NVSwitches
57+
# A100 (P4), H100(P5) and B200(P6) systems have NVSwitches
5858
# NVSwitch device id is 10de:1af1 for P4 instance
5959
# NVSwitch device id is 10de:22a3 for P5 instance
60-
nvswitch_check_p4 = shell_out("lspci -d 10de:1af1 | wc -l")
61-
nvswitch_check_p5 = shell_out("lspci -d 10de:22a3 | wc -l")
62-
nvswitch_check_p4.stdout.strip.to_i + nvswitch_check_p5.stdout.strip.to_i
60+
# NVSwitch device id is 10de:2901 for P6 instance
61+
# We sum the count for all these device IDs as output of lspci command will be >0
62+
# for only one device ID based on the instance type
63+
nvswitch_device_ids = ['10de:1af1', '10de:22a3', '10de:2901']
64+
nvswitch_device_ids.sum { |id| shell_out("lspci -d #{id} | wc -l").stdout.strip.to_i }
6365
end

cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,40 @@ def self.configure(chef_run)
260260
end
261261
end
262262
end
263+
264+
describe 'fabric_manager:get_nvswitches' do
265+
cached(:chef_run) do
266+
ChefSpec::SoloRunner.new(step_into: ['fabric_manager'])
267+
end
268+
269+
let(:output_of_shell) { double('shell_out') }
270+
cached(:resource) do
271+
ConvergeFabricManager.setup(chef_run)
272+
chef_run.find_resource('fabric_manager', 'setup')
273+
end
274+
275+
before do
276+
allow(resource).to receive(:shell_out).and_return(output_of_shell)
277+
end
278+
279+
context 'when count of NVSwitches > 1' do
280+
it 'correctly counts multiple NVSwitches' do
281+
allow(output_of_shell).to receive(:stdout).and_return("2\n", "0\n", "0\n")
282+
expect(resource.get_nvswitches).to eq(2)
283+
end
284+
end
285+
286+
context 'when count of NVSwitches == 0' do
287+
it 'returns zero when no NVSwitches are found' do
288+
allow(output_of_shell).to receive(:stdout).and_return("0\n")
289+
expect(resource.get_nvswitches).to eq(0)
290+
end
291+
end
292+
293+
context 'when count of NVSwitches gives unexpected output' do
294+
it 'handles non-numeric output' do
295+
allow(output_of_shell).to receive(:stdout).and_return("error\n")
296+
expect(resource.get_nvswitches).to eq(0)
297+
end
298+
end
299+
end

0 commit comments

Comments
 (0)