From b91084f08632b6215aa296b9945cd052c76a6522 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 3 Jul 2025 10:01:47 -0400 Subject: [PATCH] Add NVSwitch device ID for p6 instance type --- .../partial/_fabric_manager_common.rb | 10 +++-- .../unit/resources/fabric_manager_spec.rb | 37 +++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb index 027766f98..057ac80c1 100644 --- a/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb +++ b/cookbooks/aws-parallelcluster-platform/resources/fabric_manager/partial/_fabric_manager_common.rb @@ -54,10 +54,12 @@ def _nvidia_driver_version # Get number of nv switches def get_nvswitches - # A100 (P4) and H100(P5) systems have NVSwitches + # A100 (P4), H100(P5) and B200(P6) systems have NVSwitches # NVSwitch device id is 10de:1af1 for P4 instance # NVSwitch device id is 10de:22a3 for P5 instance - nvswitch_check_p4 = shell_out("lspci -d 10de:1af1 | wc -l") - nvswitch_check_p5 = shell_out("lspci -d 10de:22a3 | wc -l") - nvswitch_check_p4.stdout.strip.to_i + nvswitch_check_p5.stdout.strip.to_i + # NVSwitch device id is 10de:2901 for P6 instance + # We sum the count for all these device IDs as the output of the lspci command will be >0 + # for only one device ID based on the instance type + nvswitch_device_ids = ['10de:1af1', '10de:22a3', '10de:2901'] + nvswitch_device_ids.sum { |id| shell_out("lspci -d #{id} | wc -l").stdout.strip.to_i } end diff --git a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb index 11df489b8..0ef02046f 100644 --- a/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb +++ 
b/cookbooks/aws-parallelcluster-platform/spec/unit/resources/fabric_manager_spec.rb @@ -260,3 +260,40 @@ def self.configure(chef_run) end end end + +describe 'fabric_manager:get_nvswitches' do + cached(:chef_run) do + ChefSpec::SoloRunner.new(step_into: ['fabric_manager']) + end + + let(:output_of_shell) { double('shell_out') } + cached(:resource) do + ConvergeFabricManager.setup(chef_run) + chef_run.find_resource('fabric_manager', 'setup') + end + + before do + allow(resource).to receive(:shell_out).and_return(output_of_shell) + end + + context 'when count of NVSwitches > 1' do + it 'correctly counts multiple NVSwitches' do + allow(output_of_shell).to receive(:stdout).and_return("2\n", "0\n", "0\n") + expect(resource.get_nvswitches).to eq(2) + end + end + + context 'when count of NVSwitches == 0' do + it 'returns zero when no NVSwitches are found' do + allow(output_of_shell).to receive(:stdout).and_return("0\n") + expect(resource.get_nvswitches).to eq(0) + end + end + + context 'when count of NVSwitches gives unexpected output' do + it 'handles non-numeric output' do + allow(output_of_shell).to receive(:stdout).and_return("error\n") + expect(resource.get_nvswitches).to eq(0) + end + end +end