File tree Expand file tree Collapse file tree 2 files changed +43
-4
lines changed
cookbooks/aws-parallelcluster-platform
resources/fabric_manager/partial Expand file tree Collapse file tree 2 files changed +43
-4
lines changed Original file line number Diff line number Diff line change @@ -54,10 +54,12 @@ def _nvidia_driver_version
54
54
55
55
# Get number of nv switches
56
56
def get_nvswitches
57
- # A100 (P4) and H100(P5) systems have NVSwitches
57
+ # A100 (P4), H100 (P5) and B200 (P6) systems have NVSwitches
58
58
# NVSwitch device id is 10de:1af1 for P4 instance
59
59
# NVSwitch device id is 10de:22a3 for P5 instance
60
- nvswitch_check_p4 = shell_out ( "lspci -d 10de:1af1 | wc -l" )
61
- nvswitch_check_p5 = shell_out ( "lspci -d 10de:22a3 | wc -l" )
62
- nvswitch_check_p4 . stdout . strip . to_i + nvswitch_check_p5 . stdout . strip . to_i
60
+ # NVSwitch device id is 10de:2901 for P6 instance
61
+ # We sum the counts for all these device IDs, as the output of the lspci command will be >0
62
+ # for only one device ID based on the instance type
63
+ nvswitch_device_ids = [ '10de:1af1' , '10de:22a3' , '10de:2901' ]
64
+ nvswitch_device_ids . sum { |id | shell_out ( "lspci -d #{ id } | wc -l" ) . stdout . strip . to_i }
63
65
end
Original file line number Diff line number Diff line change @@ -260,3 +260,40 @@ def self.configure(chef_run)
260
260
end
261
261
end
262
262
end
263
+
264
+ describe 'fabric_manager:get_nvswitches' do
265
+ cached ( :chef_run ) do
266
+ ChefSpec ::SoloRunner . new ( step_into : [ 'fabric_manager' ] )
267
+ end
268
+
269
+ let ( :output_of_shell ) { double ( 'shell_out' ) }
270
+ cached ( :resource ) do
271
+ ConvergeFabricManager . setup ( chef_run )
272
+ chef_run . find_resource ( 'fabric_manager' , 'setup' )
273
+ end
274
+
275
+ before do
276
+ allow ( resource ) . to receive ( :shell_out ) . and_return ( output_of_shell )
277
+ end
278
+
279
+ context 'when count of NVSwitches > 1' do
280
+ it 'correctly counts multiple NVSwitches' do
281
+ allow ( output_of_shell ) . to receive ( :stdout ) . and_return ( "2\n " , "0\n " , "0\n " )
282
+ expect ( resource . get_nvswitches ) . to eq ( 2 )
283
+ end
284
+ end
285
+
286
+ context 'when count of NVSwitches == 0' do
287
+ it 'returns zero when no NVSwitches are found' do
288
+ allow ( output_of_shell ) . to receive ( :stdout ) . and_return ( "0\n " )
289
+ expect ( resource . get_nvswitches ) . to eq ( 0 )
290
+ end
291
+ end
292
+
293
+ context 'when count of NVSwitches gives unexpected output' do
294
+ it 'handles non-numeric output' do
295
+ allow ( output_of_shell ) . to receive ( :stdout ) . and_return ( "error\n " )
296
+ expect ( resource . get_nvswitches ) . to eq ( 0 )
297
+ end
298
+ end
299
+ end
You can’t perform that action at this time.
0 commit comments