From 45f122928b89f8183a519e65e7a4f86b8a998314 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Tue, 18 Feb 2020 13:27:26 +0200 Subject: [PATCH 01/25] [thermal control] Fix pmon docker stop issue on 3800 Conflicts: device/mellanox/x86_64-mlnx_msn2700-r0/plugins/sfputil.py platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py --- platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 462267951130..e1b6ecab755c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -64,7 +64,8 @@ } thermal_api_handler_gearbox = { THERMAL_API_GET_TEMPERATURE:"gearbox{}_temp_input", - THERMAL_API_GET_HIGH_THRESHOLD:None + THERMAL_API_GET_HIGH_THRESHOLD:None, + THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD:None } thermal_ambient_apis = { THERMAL_DEV_ASIC_AMBIENT : "asic", From 23bf172fa427701a4fcc109192f02f8cb38700e3 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Fri, 21 Feb 2020 03:37:34 +0200 Subject: [PATCH 02/25] [thermal fix] Fix QA test issue Conflicts: platform/mellanox/mlnx-platform-api/sonic_platform/psu.py --- .../mlnx-platform-api/sonic_platform/fan.py | 6 +++--- .../mlnx-platform-api/sonic_platform/psu.py | 10 +++++----- .../mlnx-platform-api/sonic_platform/thermal.py | 16 +++++++++++++++- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index cc4f8e81d9b5..f16d59f817cc 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -107,15 +107,15 @@ def get_status(self): """ status = 0 if self.is_psu_fan: - status = 1 + status = 0 else: try: with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status: status = int(fault_status.read()) except (ValueError, IOError): - status = 0 + status = 1 - return status == 1 + return status == 0 def get_presence(self): diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 31887ddf4f25..551e384988f9 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -100,11 +100,7 @@ def __init__(self, psu_index, sku): self.psu_presence = psu_presence fan = Fan(sku, psu_index, psu_index, True) - if fan.get_presence(): - self._fan_list.append(fan) - - def get_name(self): - return self._name + self._fan_list.append(fan) self.psu_green_led_path = "led_psu_green" self.psu_red_led_path = "led_psu_red" @@ -112,6 +108,10 @@ def get_name(self): self.psu_led_cap_path = "led_psu_capability" + def get_name(self): + return self._name + + def _read_generic_file(self, filename, len): """ Read a generic file, returns the contents of the file diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index e1b6ecab755c..9cb715a55ed2 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -350,7 +350,7 @@ def get_temperature(self): hint = self.dependent_hint else: hint = "unknown reason" - logger.log_info("get_temperature for {} failed due to {}".format(self.name, hint)) + logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint)) return None value_str = self._read_generic_file(self.temperature, 0) if value_str is None: @@ -371,6 +371,13 @@ def get_high_threshold(self): """ if self.high_threshold is None: return None + if self.dependency and not self.dependency(): + if self.dependent_hint: + hint = self.dependent_hint + else: + hint = "unknown reason" + logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.high_threshold, 0) if value_str is None: return None @@ -390,6 +397,13 @@ def get_high_critical_threshold(self): """ if self.high_critical_threshold is None: return None + if self.dependency and not self.dependency(): + if self.dependent_hint: + hint = self.dependent_hint + else: + hint = "unknown reason" + logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.high_critical_threshold, 0) if value_str is None: return None From a1aaa934d6fd146b3ae2afcb3f911af0f538ba72 Mon Sep 17 00:00:00 2001 From: Stephen Sun <5379172+stephenxs@users.noreply.github.com> Date: Thu, 5 Dec 2019 03:40:42 +0800 Subject: [PATCH 03/25] Fix thermal control issues Conflicts: platform/mellanox/mlnx-platform-api/sonic_platform/psu.py --- .../mlnx-platform-api/sonic_platform/fan.py | 4 +- .../mlnx-platform-api/sonic_platform/psu.py | 18 ++++++++ .../sonic_platform/thermal.py | 41 ++++++++----------- .../sonic_platform/thermal_infos.py | 4 +- .../tests/duplicate_action.json | 18 ++++++++ .../tests/duplicate_condition.json | 17 ++++++++ .../mlnx-platform-api/tests/empty_action.json | 10 +++++ .../tests/empty_condition.json | 11 +++++ .../mlnx-platform-api/tests/mock_platform.py | 4 ++ .../tests/test_thermal_policy.py | 38 +++++++++++++++++ 10 files changed, 138 insertions(+), 27 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/tests/duplicate_action.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/empty_action.json create mode 100644 platform/mellanox/mlnx-platform-api/tests/empty_condition.json diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index f16d59f817cc..b212e5e12db4 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -47,7 +47,7 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False): else: self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index) self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) - self._name = 'psu_{}_fan_{}'.format(self.index, fan_index) + self._name = 'psu_{}_fan_{}'.format(self.index, 1) self.fan_max_speed_path = None self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) @@ -183,6 +183,8 @@ def get_speed(self): max_speed_in_rpm = self._get_max_speed_in_rpm() speed = 100*speed_in_rpm/max_speed_in_rpm + if speed > 100: + speed = 100 return speed diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 551e384988f9..c412b0d0fd77 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -287,3 +287,21 @@ def get_status_led(self): raise RuntimeError("Failed to read led status for psu due to {}".format(repr(e))) return self.STATUS_LED_COLOR_OFF + + + def _get_power_available_status(self): + """ + Gets the power available status + + Returns: + True if power is present and power on. + False and "power not present" if power is not present. + False and "power off" if power is present but not power on. + """ + if not self.get_presence(): + return False, "power not present" + elif not self.get_powergood_status(): + return False, "power off" + else: + return True, "" + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 9cb715a55ed2..112cf29aa0ad 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -264,7 +264,7 @@ def initialize_thermals(sku, thermal_list, psu_list): else: if category == THERMAL_DEV_CATEGORY_PSU: for index in range(count): - thermal = Thermal(category, start + index, True, psu_list[index].get_powergood_status, "power off") + thermal = Thermal(category, start + index, True, psu_list[index]._get_power_available_status) thermal_list.append(thermal) else: for index in range(count): @@ -274,7 +274,7 @@ def initialize_thermals(sku, thermal_list, psu_list): class Thermal(ThermalBase): - def __init__(self, category, index, has_index, dependency = None, hint = None): + def __init__(self, category, index, has_index, dependency = None): """ index should be a string for category ambient and int for other categories """ @@ -293,7 +293,6 @@ def __init__(self, category, index, has_index, dependency = None, hint = None): self.high_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_THRESHOLD) self.high_critical_threshold = self._get_file_from_api(THERMAL_API_GET_HIGH_CRITICAL_THRESHOLD) self.dependency = dependency - self.dependent_hint = hint def get_name(self): @@ -345,13 +344,11 @@ def get_temperature(self): A float number of current temperature in Celsius up to nearest thousandth of one degree Celsius, e.g. 30.125 """ - if self.dependency and not self.dependency(): - if self.dependent_hint: - hint = self.dependent_hint - else: - hint = "unknown reason" - logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint)) - return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_temperature for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.temperature, 0) if value_str is None: return None @@ -371,13 +368,11 @@ def get_high_threshold(self): """ if self.high_threshold is None: return None - if self.dependency and not self.dependency(): - if self.dependent_hint: - hint = self.dependent_hint - else: - hint = "unknown reason" - logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint)) - return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_high_threshold for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.high_threshold, 0) if value_str is None: return None @@ -397,13 +392,11 @@ def get_high_critical_threshold(self): """ if self.high_critical_threshold is None: return None - if self.dependency and not self.dependency(): - if self.dependent_hint: - hint = self.dependent_hint - else: - hint = "unknown reason" - logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint)) - return None + if self.dependency: + status, hint = self.dependency() + if not status: + logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint)) + return None value_str = self._read_generic_file(self.high_critical_threshold, 0) if value_str is None: return None diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py index 34d31e47d24c..82c186495f5e 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py @@ -77,12 +77,12 @@ def collect(self, chassis): """ self._status_changed = False for psu in chassis.get_all_psus(): - if psu.get_presence() and psu not in self._presence_psus: + if psu.get_presence() and psu.get_powergood_status() and psu not in self._presence_psus: self._presence_psus.add(psu) self._status_changed = True if psu in self._absence_psus: self._absence_psus.remove(psu) - elif not psu.get_presence() and psu not in self._absence_psus: + elif (not psu.get_presence() or not psu.get_powergood_status()) and psu not in self._absence_psus: self._absence_psus.add(psu) self._status_changed = True if psu in self._presence_psus: diff --git a/platform/mellanox/mlnx-platform-api/tests/duplicate_action.json b/platform/mellanox/mlnx-platform-api/tests/duplicate_action.json new file mode 100644 index 000000000000..c19787aa26e0 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/duplicate_action.json @@ -0,0 +1,18 @@ +{ + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] +} diff --git a/platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json b/platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json new file mode 100644 index 000000000000..c25d84762e2a --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/duplicate_condition.json @@ -0,0 +1,17 @@ +{ + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + }, + { + "type": "fan.any.absence" + } + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] +} diff --git a/platform/mellanox/mlnx-platform-api/tests/empty_action.json b/platform/mellanox/mlnx-platform-api/tests/empty_action.json new file mode 100644 index 000000000000..b1051b5a6f60 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/empty_action.json @@ -0,0 +1,10 @@ +{ + "name": "any fan absence", + "conditions": [ + { + "type": "fan.any.absence" + } + ], + "actions": [ + ] +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/empty_condition.json b/platform/mellanox/mlnx-platform-api/tests/empty_condition.json new file mode 100644 index 000000000000..e7a588459246 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/empty_condition.json @@ -0,0 +1,11 @@ +{ + "name": "any fan absence", + "conditions": [ + ], + "actions": [ + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index b8d070d44955..f34ace97968d 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -13,10 +13,14 @@ def set_speed(self, speed): class MockPsu: def __init__(self): self.presence = True + self.powergood = True def get_presence(self): return self.presence + def get_powergood_status(self): + return self.powergood + class MockChassis: def __init__(self): diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index ba9e502d4f74..d0a8447c5e0c 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -66,6 +66,12 @@ def test_psu_info(): assert len(psu_info.get_presence_psus()) == 1 assert psu_info.is_status_changed() + psu_list[0].powergood = False + psu_info.collect(chassis) + assert len(psu_info.get_absence_psus()) == 1 + assert len(psu_info.get_presence_psus()) == 0 + assert psu_info.is_status_changed() + def test_fan_policy(thermal_manager): chassis = MockChassis() @@ -269,4 +275,36 @@ def test_load_control_thermal_algo_action(): with pytest.raises(ValueError): action.load_from_json(json_obj) +def test_load_duplicate_condition(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'duplicate_condition.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_duplicate_action(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'duplicate_action.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_empty_condition(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'empty_condition.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + +def test_load_empty_action(): + from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy + with open(os.path.join(test_path, 'empty_action.json')) as f: + json_obj = json.load(f) + policy = ThermalPolicy() + with pytest.raises(Exception): + policy.load_from_json(json_obj) + From 7f341f93ae560a0d790e59c57bb40ea1121f93b0 Mon Sep 17 00:00:00 2001 From: junchao Date: Tue, 25 Feb 2020 13:08:00 +0800 Subject: [PATCH 04/25] [thermal fix] change psu._get_power_available_status to psu.get_power_available_status --- platform/mellanox/mlnx-platform-api/sonic_platform/psu.py | 2 +- platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index c412b0d0fd77..80f4948e31e8 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -289,7 +289,7 @@ def get_status_led(self): return self.STATUS_LED_COLOR_OFF - def _get_power_available_status(self): + def get_power_available_status(self): """ Gets the power available status diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 112cf29aa0ad..b45cd71db1a0 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -264,7 +264,7 @@ def initialize_thermals(sku, thermal_list, psu_list): else: if category == THERMAL_DEV_CATEGORY_PSU: for index in range(count): - thermal = Thermal(category, start + index, True, psu_list[index]._get_power_available_status) + thermal = Thermal(category, start + index, True, psu_list[index].get_power_available_status) thermal_list.append(thermal) else: for index in range(count): From 3206e9517f70ddab633fcf0beb2a7eff3e990400 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Fri, 28 Feb 2020 03:39:31 +0200 Subject: [PATCH 05/25] [thermal fix] adjust log for PSU absence and power absence --- platform/mellanox/mlnx-platform-api/sonic_platform/psu.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index 80f4948e31e8..dd22203f8a39 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -295,13 +295,13 @@ def get_power_available_status(self): Returns: True if power is present and power on. - False and "power not present" if power is not present. - False and "power off" if power is present but not power on. + False and "absence of PSU" if power is not present. + False and "absence of power" if power is present but not power on. """ if not self.get_presence(): - return False, "power not present" + return False, "absence of PSU" elif not self.get_powergood_status(): - return False, "power off" + return False, "absence of power" else: return True, "" From 4e689ac841ed04cffc08db797e0331024a7a6329 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Fri, 28 Feb 2020 07:16:22 +0200 Subject: [PATCH 06/25] [thermal fix] add unit test for loading thermal policy file with duplicate conditions in different policies --- .../tests/policy_with_same_conditions.json | 75 +++++++++++++++++++ .../tests/test_thermal_policy.py | 8 ++ 2 files changed, 83 insertions(+) create mode 100644 platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json diff --git a/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json b/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json new file mode 100644 index 000000000000..ace291be1c55 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/policy_with_same_conditions.json @@ -0,0 +1,75 @@ +{ + "thermal_control_algorithm": { + "run_at_boot_up": "false", + "fan_speed_when_suspend": "60" + }, + "info_types": [ + { + "type": "fan_info" + }, + { + "type": "psu_info" + }, + { + "type": "chassis_info" + } + ], + "policies": [ + { + "name": "all fan and psu presence", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "any psu absence", + "conditions": [ + { + "type": "psu.any.absence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, + { + "name": "all fan and psu presence 1", + "conditions": [ + { + "type": "fan.all.presence" + }, + { + "type": "psu.all.presence" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "true" + } + ] + } + ] +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index d0a8447c5e0c..843244e937fa 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -307,4 +307,12 @@ def test_load_empty_action(): with pytest.raises(Exception): policy.load_from_json(json_obj) +def test_load_policy_with_same_conditions(): + from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase + class MockThermalManager(ThermalManagerBase): + pass + + with pytest.raises(Exception): + MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json')) + From 0b45c45bab13cf6356532e8dc0cc060b4187b972 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Tue, 10 Mar 2020 02:43:46 +0200 Subject: [PATCH 07/25] [thermal] fix fan.get_presence for non-removable SKU --- .../sonic_platform/chassis.py | 4 ++-- .../mlnx-platform-api/sonic_platform/fan.py | 20 +++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py index fe8b31898387..88ffe425a655 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py @@ -110,9 +110,9 @@ def initialize_fan(self): for index in range(num_of_fan): if multi_rotor_in_drawer: - fan = Fan(has_fan_dir, index, index/2) + fan = Fan(has_fan_dir, index, index/2, False, self.sku_name) else: - fan = Fan(has_fan_dir, index, index) + fan = Fan(has_fan_dir, index, index, False, self.sku_name) self._fan_list.append(fan) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index b212e5e12db4..f11fbd38d920 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -25,17 +25,22 @@ # fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches FAN_DIR = "/var/run/hw-management/system/fan_dir" +# SKUs with unplugable FANs: +# 1. don't have fanX_status and should be treated as always present +hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100'] + class Fan(FanBase): """Platform-specific Fan class""" STATUS_LED_COLOR_ORANGE = "orange" - def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False): + def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): # API index is starting from 0, Mellanox platform index is starting from 1 self.index = fan_index + 1 self.drawer_index = drawer_index + 1 self.is_psu_fan = psu_fan + self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True self.fan_min_speed_path = "fan{}_min".format(self.index) if not self.is_psu_fan: @@ -132,11 +137,14 @@ def get_presence(self): else: status = 0 else: - try: - with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: - status = int(presence_status.read()) - except (ValueError, IOError): - status = 0 + if self.always_presence: + status = 1 + else: + try: + with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status: + status = int(presence_status.read()) + except (ValueError, IOError): + status = 0 return status == 1 From bdfc65215e52fe1601d91d9a1bc0a05c47fa1520 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Tue, 10 Mar 2020 05:40:25 +0200 Subject: [PATCH 08/25] [thermal fix] fix issue: fan direction is based on drawer --- platform/mellanox/mlnx-platform-api/sonic_platform/fan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index f11fbd38d920..b7ed8d1cb894 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -91,7 +91,7 @@ def get_direction(self): try: with open(os.path.join(self.fan_dir), 'r') as fan_dir: fan_dir_bits = int(fan_dir.read()) - fan_mask = 1 << self.index - 1 + fan_mask = 1 << self.drawer_index - 1 if fan_dir_bits & fan_mask: return self.FAN_DIRECTION_INTAKE else: From 80e0b887833cdc2db204e4a63951ed0087cf5321 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Tue, 10 Mar 2020 14:43:53 +0200 Subject: [PATCH 09/25] Fix issue: when fan is not present, should not read fan direction from sysfs but directly return N/A --- platform/mellanox/mlnx-platform-api/sonic_platform/fan.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index b7ed8d1cb894..9ce65d1e2f98 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -85,7 +85,7 @@ def get_direction(self): 1 stands for forward, in other words intake 0 stands for reverse, in other words exhaust """ - if not self.fan_dir or self.is_psu_fan: + if not self.fan_dir or self.is_psu_fan or not self.get_presence(): return self.FAN_DIRECTION_NOT_APPLICABLE try: From 5b46d6fbaf5486f586b068b2507af2bc8e406450 Mon Sep 17 00:00:00 2001 From: Junchao Chen Date: Wed, 11 Mar 2020 03:13:21 +0200 Subject: [PATCH 10/25] [thermal fix] add unit test for get_direction for absent FAN --- .../mlnx-platform-api/tests/test_fan_api.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 platform/mellanox/mlnx-platform-api/tests/test_fan_api.py diff --git a/platform/mellanox/mlnx-platform-api/tests/test_fan_api.py b/platform/mellanox/mlnx-platform-api/tests/test_fan_api.py new file mode 100644 index 000000000000..381260163c0f --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/tests/test_fan_api.py @@ -0,0 +1,17 @@ +import os +import sys +from mock import MagicMock + +test_path = os.path.dirname(os.path.abspath(__file__)) +modules_path = os.path.dirname(test_path) +sys.path.insert(0, modules_path) + +from sonic_platform.fan import Fan + + +def test_get_absence_fan_direction(): + fan = Fan(True, 0, 0) + fan.get_presence = MagicMock(return_value=False) + assert fan.fan_dir is not None + assert not fan.is_psu_fan + assert fan.get_direction() == Fan.FAN_DIRECTION_NOT_APPLICABLE From 2d42cd35fad3f856eaa745aea7f6698379121bf4 Mon Sep 17 00:00:00 2001 From: junchao Date: Mon, 16 Mar 2020 10:16:05 +0800 Subject: [PATCH 11/25] Unplugable PSU has no FAN, no need add a FAN object for this PSU --- platform/mellanox/mlnx-platform-api/sonic_platform/psu.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py index dd22203f8a39..d6104b938dfd 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/psu.py @@ -99,8 +99,10 @@ def __init__(self, psu_index, sku): psu_presence = os.path.join(self.psu_path, psu_presence) self.psu_presence = psu_presence - fan = Fan(sku, psu_index, psu_index, True) - self._fan_list.append(fan) + # unplugable PSU has no FAN + if sku not in hwsku_dict_with_unplugable_psu: + fan = Fan(sku, psu_index, psu_index, True) + self._fan_list.append(fan) self.psu_green_led_path = "led_psu_green" self.psu_red_led_path = "led_psu_red" From 6f80098c05c6d0ba9c5cf78cf6f697859413beb5 Mon Sep 17 00:00:00 2001 From: junchao Date: Wed, 18 Mar 2020 09:48:02 +0800 Subject: [PATCH 12/25] 1. Enable thermal alogrithm by default; 2. set cooling level before set speed; 3. check thermal zone temperature before set speed; 4. enable thermal algorithm by change thermal_zone_mode --- .../thermal_policy.json | 31 ++++- .../mlnx-platform-api/sonic_platform/fan.py | 29 ++++- .../sonic_platform/thermal.py | 106 ++++++++++++++++ .../sonic_platform/thermal_actions.py | 16 +++ .../sonic_platform/thermal_conditions.py | 14 +++ .../sonic_platform/thermal_infos.py | 22 +++- .../sonic_platform/thermal_manager.py | 27 +---- .../mlnx-platform-api/tests/mock_platform.py | 4 + .../tests/test_thermal_policy.py | 114 ++++++++++++++++++ .../tests/thermal_policy.json | 25 ++++ 10 files changed, 359 insertions(+), 29 deletions(-) diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json index 054d797be951..f7e7b103325e 100644 --- a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json @@ -1,6 +1,6 @@ { "thermal_control_algorithm": { - "run_at_boot_up": "false", + "run_at_boot_up": "true", "fan_speed_when_suspend": "60" }, "info_types": [ @@ -51,6 +51,24 @@ } ] }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, { "name": "all fan and psu presence", "conditions": [ @@ -59,12 +77,19 @@ }, { "type": "psu.all.presence" + }, + { + "type": "fan.all.good" } ], "actions": [ { - "type": "fan.all.set_speed", - "speed": "60" + "type": "thermal_control.control", + "status": "true" + }, + { + "type": "fan.all.check_and_set_speed", + "speed": "60" } ] } diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 9ce65d1e2f98..7b715020e577 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -24,6 +24,7 @@ LED_PATH = "/var/run/hw-management/led/" # fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches FAN_DIR = "/var/run/hw-management/system/fan_dir" +COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" # SKUs with unplugable FANs: # 1. don't have fanX_status and should be treated as always present @@ -231,13 +232,15 @@ def set_speed(self, speed): bool: True if set success, False if fail. """ status = True - pwm = int(round(PWM_MAX*speed/100.0)) if self.is_psu_fan: #PSU fan speed is not setable. return False try: + cooling_level = int(speed / 10) + self.set_cooling_level(cooling_level) + pwm = int(round(PWM_MAX*speed/100.0)) with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm: fan_pwm.write(str(pwm)) except (ValueError, IOError): @@ -352,3 +355,27 @@ def get_speed_tolerance(self): """ # The tolerance value is fixed as 20% for all the Mellanox platform return 20 + + @classmethod + def set_cooling_level(cls, level): + """ + Change cooling level. The input level should be an integer value [1, 10]. + 1 means 10%, 2 means 20%, 10 means 100%. + """ + if not isinstance(level, int): + raise RuntimeError("Failed to set cooling level, input parameter must be integer") + + if level < 1 or level > 10: + raise RuntimeError("Failed to set cooling level, level value must be in range [1, 10], got {}".format(level)) + + try: + # reset FAN driver and change cooling state + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(level + 10) + + # make cooling state diplay correct value + with open(COOLING_STATE_PATH, 'w') as cooling_state: + cooling_state.write(level) + except (ValueError, IOError) as e: + raise RuntimeError("Failed to set cooling level - {}".format(e)) + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index b45cd71db1a0..94ddafbc34fa 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -42,6 +42,16 @@ HW_MGMT_THERMAL_ROOT = "/var/run/hw-management/thermal/" +THERMAL_ZONE_ASIC_PATH = "/var/run/hw-management/thermal/mlxsw/" +THERMAL_ZONE_MODULE_PATH = "/var/run/hw-management/thermal/mlxsw-module{}/" +THERMAL_ZONE_GEARBOX_PATH = "/var/run/hw-management/thermal/mlxsw-gearbox{}/" +THERMAL_ZONE_MODE = "thermal_zone_mode" +THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp" +THERMAL_ZONE_NORMAL_TEMPERATURE = "temp_trip_norm" + +MODULE_COUNTER_PATH = "/var/run/hw-management/config/module_counter" +GEARBOX_COUNTER_PATH = "/var/run/hw-management/config/gearbox_counter" + thermal_api_handler_cpu_core = { THERMAL_API_GET_TEMPERATURE:"cpu_core{}", THERMAL_API_GET_HIGH_THRESHOLD:"cpu_core{}_max", @@ -246,6 +256,7 @@ def initialize_thermals(sku, thermal_list, psu_list): # create thermal objects for all categories of sensors tp_index = hwsku_dict_thermal[sku] thermal_profile = thermal_profile_list[tp_index] + Thermal.thermal_profile = thermal_profile for category in thermal_device_categories_all: if category == THERMAL_DEV_CATEGORY_AMBIENT: count, ambient_list = thermal_profile[category] @@ -274,6 +285,9 @@ def initialize_thermals(sku, thermal_list, psu_list): class Thermal(ThermalBase): + thermal_profile = None + thermal_algorithm_status = False + def __init__(self, category, index, has_index, dependency = None): """ index should be a string for category ambient and int for other categories @@ -404,3 +418,95 @@ def get_high_critical_threshold(self): if self.category == THERMAL_DEV_CATEGORY_MODULE and value_float == THERMAL_API_INVALID_HIGH_THRESHOLD: return None return value_float / 1000.0 + + + @classmethod + def _write_generic_file(cls, filename, content): + """ + Write a generic file if content changed + """ + try: + with open(filename, 'w+') as file_obj: + origin_content = file_obj.read() + if origin_content != content: + file_obj.write(content) + except Exception as e: + logger.log_info("Fail to write file {} due to {}".format(filename, repr(e))) + + @classmethod + def set_thermal_algorithm_status(cls, status): + """ + Enable/disable kernel thermal algorithm + """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if cls.thermal_algorithm_status == status: + return + + cls.thermal_algorithm_status = status + content = "enabled" if status else "disabled" + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_MODE), content) + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_MODE), content) + + if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_MODE), content) + + @classmethod + def check_thermal_zone_temperature(cls): + """ + Check thermal zone current temperature with normal temperature + + Returns: + True if all thermal zones current temperature less or equal than normal temperature + """ + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_ASIC_PATH): + return False + + if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_MODULE_PATH.format(start + index)): + return False + + if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] + if count != 0: + for index in range(count): + if not cls._check_thermal_zone_temperature(THERMAL_ZONE_GEARBOX_PATH.format(start + index)): + return False + + return True + + + @classmethod + def _check_thermal_zone_temperature(cls, thermal_zone_path): + normal_temp_path = join(thermal_zone_path, THERMAL_ZONE_NORMAL_TEMPERATURE) + current_temp_path = join(thermal_zone_path, THERMAL_ZONE_TEMPERATURE) + normal = None + current = None + try: + with open(normal_temp_path, 'r') as file_obj: + normal = float(file_obj.read()) + + with open(current_temp_path, 'r') as file_obj: + current = float(file_obj.read()) + + return current <= normal + except Exception as e: + logger.log_info("Fail to check thermal zone temperature for file {} due to {}".format(thermal_zone_path, repr(e))) + + + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 72729287d1c5..1f6a587a404f 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -54,6 +54,22 @@ def execute(self, thermal_info_dict): fan.set_speed(self.speed) +@thermal_json_object('fan.all.check_and_set_speed') +class CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction): + """ + Action to check thermal zone temperature and recover speed for all fans + """ + def execute(self, thermal_info_dict): + """ + Check thermal zone and set speed for all fans + :param thermal_info_dict: A dictionary stores all thermal information. + :return: + """ + from .thermal import Thermal + if Thermal.check_thermal_zone_temperature(): + SetAllFanSpeedAction.execute(self, thermal_info_dict) + + @thermal_json_object('thermal_control.control') class ControlThermalAlgoAction(ThermalPolicyActionBase): """ diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index 2df59acc9bf1..941875b95499 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -32,6 +32,20 @@ def is_match(self, thermal_info_dict): return len(fan_info_obj.get_absence_fans()) == 0 if fan_info_obj else False +@thermal_json_object('fan.any.fault') +class AnyFanFaultCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) > 0 if fan_info_obj else False + + +@thermal_json_object('fan.all.good') +class AllFanGoodCondition(FanCondition): + def is_match(self, thermal_info_dict): + fan_info_obj = self.get_fan_info(thermal_info_dict) + return len(fan_info_obj.get_fault_fans()) == 0 if fan_info_obj else False + + class PsuCondition(ThermalPolicyConditionBase): def get_psu_info(self, thermal_info_dict): from .thermal_infos import PsuInfo diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py index 82c186495f5e..e810a5646456 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_infos.py @@ -14,6 +14,7 @@ class FanInfo(ThermalPolicyInfoBase): def __init__(self): self._absence_fans = set() self._presence_fans = set() + self._fault_fans = set() self._status_changed = False def collect(self, chassis): @@ -24,17 +25,27 @@ def collect(self, chassis): """ self._status_changed = False for fan in chassis.get_all_fans(): - if fan.get_presence() and fan not in self._presence_fans: + presence = fan.get_presence() + status = fan.get_status() + if presence and fan not in self._presence_fans: self._presence_fans.add(fan) self._status_changed = True if fan in self._absence_fans: self._absence_fans.remove(fan) - elif not fan.get_presence() and fan not in self._absence_fans: + elif not presence and fan not in self._absence_fans: self._absence_fans.add(fan) self._status_changed = True if fan in self._presence_fans: self._presence_fans.remove(fan) + if not status and fan not in self._fault_fans: + self._fault_fans.add(fan) + self._status_changed = True + elif status and fan in self._fault_fans: + self._fault_fans.remove(fan) + self._status_changed = True + + def get_absence_fans(self): """ Retrieves absence fans @@ -49,6 +60,13 @@ def get_presence_fans(self): """ return self._presence_fans + def get_fault_fans(self): + """ + Retrieves fault fans + :return: A set of fault fans + """ + return self._fault_fans + def is_status_changed(self): """ Retrieves if the status of fan information changed diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 133bb078ca20..137e6d9da5b8 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -6,8 +6,6 @@ class ThermalManager(ThermalManagerBase): - THERMAL_ALGORITHM_CONTROL_PATH = '/var/run/hw-management/config/suspend' - @classmethod def start_thermal_control_algorithm(cls): """ @@ -16,7 +14,8 @@ def start_thermal_control_algorithm(cls): Returns: bool: True if set success, False if fail. """ - cls._control_thermal_control_algorithm(False) + from .thermal import Thermal + Thermal.set_thermal_algorithm_status(True) @classmethod def stop_thermal_control_algorithm(cls): @@ -26,25 +25,7 @@ def stop_thermal_control_algorithm(cls): Returns: bool: True if set success, False if fail. """ - cls._control_thermal_control_algorithm(True) - - @classmethod - def _control_thermal_control_algorithm(cls, suspend): - """ - Control thermal control algorithm + from .thermal import Thermal + Thermal.set_thermal_algorithm_status(False) - Args: - suspend: Bool, indicate suspend the algorithm or not - - Returns: - bool: True if set success, False if fail. - """ - status = True - write_value = 1 if suspend else 0 - try: - with open(cls.THERMAL_ALGORITHM_CONTROL_PATH, 'w') as control_file: - control_file.write(str(write_value)) - except (ValueError, IOError): - status = False - return status diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index f34ace97968d..c4294ff9039a 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -2,6 +2,7 @@ class MockFan: def __init__(self): self.presence = True self.speed = 60 + self.status = True def get_presence(self): return self.presence @@ -9,6 +10,9 @@ def get_presence(self): def set_speed(self, speed): self.speed = speed + def get_status(self): + return self.status + class MockPsu: def __init__(self): diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 843244e937fa..d1b3ea8b12c7 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -11,6 +11,10 @@ from sonic_platform.thermal_manager import ThermalManager from sonic_platform.thermal_infos import FanInfo, PsuInfo +from sonic_platform.fan import Fan +from sonic_platform.thermal import Thermal + +Thermal.check_thermal_zone_temperature = MagicMock() @pytest.fixture(scope='session', autouse=True) @@ -27,6 +31,7 @@ def test_load_policy(thermal_manager): assert 'any fan absence' in thermal_manager._policy_dict assert 'any psu absence' in thermal_manager._policy_dict + assert 'any fan broken' in thermal_manager._policy_dict assert 'all fan and psu presence' in thermal_manager._policy_dict assert thermal_manager._fan_speed_when_suspend == 60 @@ -40,6 +45,7 @@ def test_fan_info(): fan_info.collect(chassis) assert len(fan_info.get_absence_fans()) == 1 assert len(fan_info.get_presence_fans()) == 0 + assert len(fan_info.get_fault_fans()) == 0 assert fan_info.is_status_changed() fan_list = chassis.get_all_fans() @@ -47,8 +53,15 @@ def test_fan_info(): fan_info.collect(chassis) assert len(fan_info.get_absence_fans()) == 0 assert len(fan_info.get_presence_fans()) == 1 + assert len(fan_info.get_fault_fans()) == 0 assert fan_info.is_status_changed() + fan_list[0].status = False + fan_info.collect(chassis) + assert len(fan_info.get_absence_fans()) == 0 + assert len(fan_info.get_presence_fans()) == 1 + assert len(fan_info.get_fault_fans()) == 1 + assert fan_info.is_status_changed() def test_psu_info(): chassis = MockChassis() @@ -86,8 +99,24 @@ def test_fan_policy(thermal_manager): thermal_manager.stop_thermal_control_algorithm.assert_called_once() fan_list[0].presence = True + Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) thermal_manager.run_policy(chassis) thermal_manager.start_thermal_control_algorithm.assert_called_once() + Thermal.check_thermal_zone_temperature.assert_called_once() + assert fan_list[0].speed == 60 + assert fan_list[1].speed == 60 + + fan_list[0].status = False + thermal_manager.run_policy(chassis) + assert thermal_manager.stop_thermal_control_algorithm.call_count == 2 + + fan_list[0].status = True + Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) + thermal_manager.run_policy(chassis) + assert thermal_manager.start_thermal_control_algorithm.call_count == 2 + Thermal.check_thermal_zone_temperature.assert_called_once() + assert fan_list[0].speed == 100 + assert fan_list[1].speed == 100 def test_psu_policy(thermal_manager): @@ -159,6 +188,44 @@ def test_all_fan_presence_condition(): fan_info.collect(chassis) assert condition.is_match({'fan_info': fan_info}) +def test_any_fan_fault_condition(): + chassis = MockChassis() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fault_fan = MockFan() + fault_fan.status = False + fan_list.append(fault_fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AnyFanFaultCondition + condition = AnyFanFaultCondition() + assert condition.is_match({'fan_info': fan_info}) + + fault_fan.status = True + fan_info.collect(chassis) + assert not condition.is_match({'fan_info': fan_info}) + +def test_all_fan_good_condition(): + chassis = MockChassis() + fan = MockFan() + fan_list = chassis.get_all_fans() + fan_list.append(fan) + fault_fan = MockFan() + fault_fan.status = False + fan_list.append(fault_fan) + fan_info = FanInfo() + fan_info.collect(chassis) + + from sonic_platform.thermal_conditions import AllFanGoodCondition + condition = AllFanGoodCondition() + assert not condition.is_match({'fan_info': fan_info}) + + fault_fan.status = True + fan_info.collect(chassis) + assert condition.is_match({'fan_info': fan_info}) + def test_any_psu_absence_condition(): chassis = MockChassis() @@ -275,6 +342,53 @@ def test_load_control_thermal_algo_action(): with pytest.raises(ValueError): action.load_from_json(json_obj) +def test_load_check_and_set_speed_action(): + from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction + action = CheckAndSetAllFanSpeedAction() + json_str = '{\"speed\": \"40\"}' + json_obj = json.loads(json_str) + action.load_from_json(json_obj) + assert action.speed == 40 + + json_str = '{\"speed\": \"-1\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"speed\": \"101\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + + json_str = '{\"invalid\": \"60\"}' + json_obj = json.loads(json_str) + with pytest.raises(ValueError): + action.load_from_json(json_obj) + +def test_execute_check_and_set_fan_speed_action(): + chassis = MockChassis() + fan_list = chassis.get_all_fans() + fan_list.append(MockFan()) + fan_list.append(MockFan()) + fan_info = FanInfo() + fan_info.collect(chassis) + Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) + + from sonic_platform.thermal_actions import CheckAndSetAllFanSpeedAction + action = CheckAndSetAllFanSpeedAction() + action.speed = 99 + action.execute({'fan_info': fan_info}) + assert fan_list[0].speed == 99 + assert fan_list[1].speed == 99 + + Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) + fan_list[0].speed = 100 + fan_list[1].speed = 100 + action.speed = 60 + action.execute({'fan_info': fan_info}) + assert fan_list[0].speed == 100 + assert fan_list[1].speed == 100 + def test_load_duplicate_condition(): from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy with open(os.path.join(test_path, 'duplicate_condition.json')) as f: diff --git a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json index 5d31b2abd875..413211b21220 100644 --- a/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json +++ b/platform/mellanox/mlnx-platform-api/tests/thermal_policy.json @@ -51,6 +51,24 @@ } ] }, + { + "name": "any fan broken", + "conditions": [ + { + "type": "fan.any.fault" + } + ], + "actions": [ + { + "type": "thermal_control.control", + "status": "false" + }, + { + "type": "fan.all.set_speed", + "speed": "100" + } + ] + }, { "name": "all fan and psu presence", "conditions": [ @@ -59,12 +77,19 @@ }, { "type": "psu.all.presence" + }, + { + "type": "fan.all.good" } ], "actions": [ { "type": "thermal_control.control", "status": "true" + }, + { + "type": "fan.all.check_and_set_speed", + "speed": "60" } ] } From d89120014f608ebb1b5996014205e463b883d925 Mon Sep 17 00:00:00 2001 From: junchao Date: Wed, 18 Mar 2020 12:04:38 +0800 Subject: [PATCH 13/25] start thermal algorithm should also check thermal zone temperature --- .../x86_64-mlnx_msn2700-r0/thermal_policy.json | 4 ---- .../sonic_platform/thermal_actions.py | 14 ++++++++++++-- .../mlnx-platform-api/tests/mock_platform.py | 7 +++++-- .../mlnx-platform-api/tests/test_thermal_policy.py | 4 ++-- 4 files changed, 19 insertions(+), 10 deletions(-) diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json index f7e7b103325e..f16f68dd002e 100644 --- a/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json @@ -86,10 +86,6 @@ { "type": "thermal_control.control", "status": "true" - }, - { - "type": "fan.all.check_and_set_speed", - "speed": "60" } ] } diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 1f6a587a404f..128c2dc910e6 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -112,13 +112,23 @@ def execute(self, thermal_info_dict): :return: """ from .thermal_infos import ChassisInfo + from .thermal_infos import FanInfo + from .thermal import Thermal if ChassisInfo.INFO_NAME in thermal_info_dict: chassis_info_obj = thermal_info_dict[ChassisInfo.INFO_NAME] chassis = chassis_info_obj.get_chassis() thermal_manager = chassis.get_thermal_manager() if self.status: thermal_manager.start_thermal_control_algorithm() + + # Check thermal zone temperature, if all thermal zone temperature + # return to normal and FAN speed is still 100%, set it to 60% to + # save power + if Thermal.check_thermal_zone_temperature(): + fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] + for fan in fan_info_obj.get_presence_fans(): + if fan.get_target_speed() != 100: + break + fan.set_speed(60) else: thermal_manager.stop_thermal_control_algorithm() - - diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index c4294ff9039a..ff8adb66ac91 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -1,18 +1,21 @@ class MockFan: + speed = 60 def __init__(self): self.presence = True - self.speed = 60 self.status = True def get_presence(self): return self.presence def set_speed(self, speed): - self.speed = speed + MockFan.speed = speed def get_status(self): return self.status + def get_target_speed(self): + return MockFan.speed + class MockPsu: def __init__(self): diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index d1b3ea8b12c7..867345cc2b6f 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -102,7 +102,7 @@ def test_fan_policy(thermal_manager): Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) thermal_manager.run_policy(chassis) thermal_manager.start_thermal_control_algorithm.assert_called_once() - Thermal.check_thermal_zone_temperature.assert_called_once() + assert Thermal.check_thermal_zone_temperature.call_count == 2 assert fan_list[0].speed == 60 assert fan_list[1].speed == 60 @@ -114,7 +114,7 @@ def test_fan_policy(thermal_manager): Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) thermal_manager.run_policy(chassis) assert thermal_manager.start_thermal_control_algorithm.call_count == 2 - Thermal.check_thermal_zone_temperature.assert_called_once() + assert Thermal.check_thermal_zone_temperature.call_count == 2 assert fan_list[0].speed == 100 assert fan_list[1].speed == 100 From 71c366532789d45e8a50a51cbb49cdc0ae9627ec Mon Sep 17 00:00:00 2001 From: junchao Date: Thu, 19 Mar 2020 15:23:24 +0800 Subject: [PATCH 14/25] Should write string to file --- platform/mellanox/mlnx-platform-api/sonic_platform/fan.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 7b715020e577..fa1ccb5d8e8b 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -371,11 +371,11 @@ def set_cooling_level(cls, level): try: # reset FAN driver and change cooling state with open(COOLING_STATE_PATH, 'w') as cooling_state: - cooling_state.write(level + 10) + cooling_state.write(str(level + 10)) # make cooling state diplay correct value with open(COOLING_STATE_PATH, 'w') as cooling_state: - cooling_state.write(level) + cooling_state.write(str(level)) except (ValueError, IOError) as e: raise RuntimeError("Failed to set cooling level - {}".format(e)) From f40919594189bf6efcce54fccbac2e7de4ff0926 Mon Sep 17 00:00:00 2001 From: junchao Date: Thu, 19 Mar 2020 15:42:42 +0800 Subject: [PATCH 15/25] We should force enable or disable thermal algo when thermal control daemon start up --- .../sonic_platform/thermal.py | 4 +-- .../sonic_platform/thermal_actions.py | 30 +++++++------------ .../tests/test_thermal_policy.py | 17 +++++------ 3 files changed, 20 insertions(+), 31 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 94ddafbc34fa..b68d129ac52c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -434,14 +434,14 @@ def _write_generic_file(cls, filename, content): logger.log_info("Fail to write file {} due to {}".format(filename, repr(e))) @classmethod - def set_thermal_algorithm_status(cls, status): + def set_thermal_algorithm_status(cls, status, force=True): """ Enable/disable kernel thermal algorithm """ if not cls.thermal_profile: raise Exception("Fail to get thermal profile for this switch") - if cls.thermal_algorithm_status == status: + if not force and cls.thermal_algorithm_status == status: return cls.thermal_algorithm_status = status diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 128c2dc910e6..86db8f203c88 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -111,24 +111,16 @@ def execute(self, thermal_info_dict): :param thermal_info_dict: A dictionary stores all thermal information. :return: """ - from .thermal_infos import ChassisInfo from .thermal_infos import FanInfo from .thermal import Thermal - if ChassisInfo.INFO_NAME in thermal_info_dict: - chassis_info_obj = thermal_info_dict[ChassisInfo.INFO_NAME] - chassis = chassis_info_obj.get_chassis() - thermal_manager = chassis.get_thermal_manager() - if self.status: - thermal_manager.start_thermal_control_algorithm() - - # Check thermal zone temperature, if all thermal zone temperature - # return to normal and FAN speed is still 100%, set it to 60% to - # save power - if Thermal.check_thermal_zone_temperature(): - fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] - for fan in fan_info_obj.get_presence_fans(): - if fan.get_target_speed() != 100: - break - fan.set_speed(60) - else: - thermal_manager.stop_thermal_control_algorithm() + Thermal.set_thermal_algorithm_status(self.status, False) + if self.status: + # Check thermal zone temperature, if all thermal zone temperature + # return to normal and FAN speed is still 100%, set it to 60% to + # save power + if Thermal.check_thermal_zone_temperature(): + fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] + for fan in fan_info_obj.get_presence_fans(): + if fan.get_target_speed() != 100: + break + fan.set_speed(60) diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 867345cc2b6f..128fa16de040 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -15,6 +15,7 @@ from sonic_platform.thermal import Thermal Thermal.check_thermal_zone_temperature = MagicMock() +Thermal.set_thermal_algorithm_status = MagicMock() @pytest.fixture(scope='session', autouse=True) @@ -90,30 +91,28 @@ def test_fan_policy(thermal_manager): chassis = MockChassis() chassis.make_fan_absence() chassis.fan_list.append(MockFan()) - thermal_manager.start_thermal_control_algorithm = MagicMock() - thermal_manager.stop_thermal_control_algorithm = MagicMock() thermal_manager.run_policy(chassis) fan_list = chassis.get_all_fans() assert fan_list[1].speed == 100 - thermal_manager.stop_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) fan_list[0].presence = True Thermal.check_thermal_zone_temperature = MagicMock(return_value=True) thermal_manager.run_policy(chassis) - thermal_manager.start_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) assert Thermal.check_thermal_zone_temperature.call_count == 2 assert fan_list[0].speed == 60 assert fan_list[1].speed == 60 fan_list[0].status = False thermal_manager.run_policy(chassis) - assert thermal_manager.stop_thermal_control_algorithm.call_count == 2 + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) fan_list[0].status = True Thermal.check_thermal_zone_temperature = MagicMock(return_value=False) thermal_manager.run_policy(chassis) - assert thermal_manager.start_thermal_control_algorithm.call_count == 2 + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) assert Thermal.check_thermal_zone_temperature.call_count == 2 assert fan_list[0].speed == 100 assert fan_list[1].speed == 100 @@ -123,18 +122,16 @@ def test_psu_policy(thermal_manager): chassis = MockChassis() chassis.make_psu_absence() chassis.fan_list.append(MockFan()) - thermal_manager.start_thermal_control_algorithm = MagicMock() - thermal_manager.stop_thermal_control_algorithm = MagicMock() thermal_manager.run_policy(chassis) fan_list = chassis.get_all_fans() assert fan_list[0].speed == 100 - thermal_manager.stop_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(False, False) psu_list = chassis.get_all_psus() psu_list[0].presence = True thermal_manager.run_policy(chassis) - thermal_manager.start_thermal_control_algorithm.assert_called_once() + Thermal.set_thermal_algorithm_status.assert_called_with(True, False) def test_any_fan_absence_condition(): From 12a9d8fea0c768978e4d0e4728dae1451e66c87a Mon Sep 17 00:00:00 2001 From: junchao Date: Thu, 19 Mar 2020 16:32:08 +0800 Subject: [PATCH 16/25] Change thermal algorithm status should also change thermal zone policy --- .../mellanox/mlnx-platform-api/sonic_platform/thermal.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index b68d129ac52c..181c3ab5e3bc 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -46,6 +46,7 @@ THERMAL_ZONE_MODULE_PATH = "/var/run/hw-management/thermal/mlxsw-module{}/" THERMAL_ZONE_GEARBOX_PATH = "/var/run/hw-management/thermal/mlxsw-gearbox{}/" THERMAL_ZONE_MODE = "thermal_zone_mode" +THERMAL_ZONE_POLICY = "thermal_zone_policy" THERMAL_ZONE_TEMPERATURE = "thermal_zone_temp" THERMAL_ZONE_NORMAL_TEMPERATURE = "temp_trip_norm" @@ -446,19 +447,23 @@ def set_thermal_algorithm_status(cls, status, force=True): cls.thermal_algorithm_status = status content = "enabled" if status else "disabled" + policy = "step_wise" if status else "user_space" cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_ASIC_PATH, THERMAL_ZONE_POLICY), policy) if THERMAL_DEV_CATEGORY_MODULE in cls.thermal_profile: start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] if count != 0: for index in range(count): cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_MODULE_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) if THERMAL_DEV_CATEGORY_GEARBOX in cls.thermal_profile: start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_GEARBOX] if count != 0: for index in range(count): cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_MODE), content) + cls._write_generic_file(join(THERMAL_ZONE_GEARBOX_PATH.format(start + index), THERMAL_ZONE_POLICY), policy) @classmethod def check_thermal_zone_temperature(cls): From 766aff02fe38be754e968c1b30f4c20790317ac7 Mon Sep 17 00:00:00 2001 From: junchao Date: Thu, 26 Mar 2020 19:32:41 +0800 Subject: [PATCH 17/25] Add fan speed dynamic minimum value --- .../sonic_platform/device_data.py | 134 ++++++++++++++++++ .../mlnx-platform-api/sonic_platform/fan.py | 13 ++ .../sonic_platform/thermal.py | 30 +++- .../sonic_platform/thermal_actions.py | 30 +++- .../sonic_platform/thermal_conditions.py | 26 ++++ .../sonic_platform/thermal_manager.py | 17 ++- .../tests/test_thermal_policy.py | 43 ++++++ 7 files changed, 289 insertions(+), 4 deletions(-) create mode 100644 platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py new file mode 100644 index 000000000000..f12ae0d5f323 --- /dev/null +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py @@ -0,0 +1,134 @@ +DEVICE_DATA = { + 'ACS-MSN2700': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'LS-SN2700': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'ACS-MSN2740': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":13}, + "p2c_untrust": {"-127:35":13, "36:40":14 , "41:120":15}, + "c2p_trust": {"-127:120":13}, + "c2p_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, + "unk_trust": {"-127:120":13}, + "unk_untrust": {"-127:15":13, "16:30":14 , "31:35":15, "36:120":17}, + } + } + }, + 'ACS-MSN2410': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'Mellanox-SN2700': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'Mellanox-SN2700-D48C8': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:40":13, "41:120":15}, + "p2c_untrust": {"-127:25":13, "26:30":14 , "31:35":15, "36:120":16}, + "c2p_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "c2p_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_trust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16}, + "unk_untrust": {"-127:20":13, "21:25":14 , "26:30":15, "31:120":16} + } + } + }, + 'ACS-MSN2100': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":12}, + "p2c_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16}, + "c2p_trust": {"-127:40":12, "41:120":13}, + "c2p_untrust": {"-127:40":12, "41:120":13}, + "unk_trust": {"-127:40":12, "41:120":13}, + "unk_untrust": {"-127:15":12, "16:25":13, "26:30":14, "31:35":15, "36:120":16} + } + } + }, + 'ACS-MSN2010': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:120":12}, + "p2c_untrust": {"-127:15":12, "16:20":13, "21:30":14, "31:35":15, "36:120":16}, + "c2p_trust": {"-127:120":12}, + "c2p_untrust": {"-127:20":12, "21:25":13 , "26:30":14, "31:35":15, "36:120":16}, + "unk_trust": {"-127:120":12}, + "unk_untrust": {"-127:15":12, "16:20":13 , "21:30":14, "31:35":15, "36:120":16} + } + } + }, + 'ACS-MSN3700': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "p2c_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "c2p_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "c2p_untrust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_trust": {"-127:25":12, "26:40":13 , "41:120":14}, + "unk_untrust": {"-127:15":12, "16:30":13 , "31:35":14, "36:40":15, "41:120":16}, + } + } + }, + 'ACS-MSN3800': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:35":12, "36:120":13}, + "p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + "c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + } + } + }, + 'Mellanox-SN3800-D112C8': { + 'thermal': { + 'minimum_table': { + "p2c_trust": {"-127:35":12, "36:120":13}, + "p2c_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + "c2p_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "c2p_untrust": {"-127:20":12, "21:30":13 , "31:35":14, "36:40":15, "41:120":16}, + "unk_trust": {"-127:30":12, "31:40":13 , "41:120":14}, + "unk_untrust": {"-127:0":12, "1:10":13 , "11:15":14, "16:20":15, "21:35":16, "36:120":17}, + } + } + }, +} \ No newline at end of file diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 9538a701e1ea..14bb873ee8ef 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -38,6 +38,7 @@ class Fan(FanBase): """Platform-specific Fan class""" STATUS_LED_COLOR_ORANGE = "orange" + min_cooling_level = 2 def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): # API index is starting from 0, Mellanox platform index is starting from 1 @@ -243,6 +244,9 @@ def set_speed(self, speed): try: cooling_level = int(speed / 10) + if cooling_level < self.min_cooling_level: + cooling_level = self.min_cooling_level + speed = self.min_cooling_level * 10 self.set_cooling_level(cooling_level) pwm = int(round(PWM_MAX*speed/100.0)) with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm: @@ -383,3 +387,12 @@ def set_cooling_level(cls, level): except (ValueError, IOError) as e: raise RuntimeError("Failed to set cooling level - {}".format(e)) + @classmethod + def get_cooling_level(cls): + try: + with open(COOLING_STATE_PATH, 'r') as cooling_state: + cooling_level = int(cooling_state.read()) + return cooling_level + except (ValueError, IOError) as e: + raise RuntimeError("Failed to get cooling level - {}".format(e)) + diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index fc6305666b6f..54976a66dde4 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -52,6 +52,7 @@ MODULE_COUNTER_PATH = "/var/run/hw-management/config/module_counter" GEARBOX_COUNTER_PATH = "/var/run/hw-management/config/gearbox_counter" +MODULE_TEMPERATURE_FAULT_PATH = "/var/run/hw-management/thermal/module{}_temp_fault" thermal_api_handler_cpu_core = { THERMAL_API_GET_TEMPERATURE:"cpu_core{}", @@ -336,7 +337,8 @@ def get_name(self): return self.name - def _read_generic_file(self, filename, len): + @classmethod + def _read_generic_file(cls, filename, len): """ Read a generic file, returns the contents of the file """ @@ -511,7 +513,6 @@ def check_thermal_zone_temperature(cls): return True - @classmethod def _check_thermal_zone_temperature(cls, thermal_zone_path): normal_temp_path = join(thermal_zone_path, THERMAL_ZONE_NORMAL_TEMPERATURE) @@ -529,5 +530,30 @@ def _check_thermal_zone_temperature(cls, thermal_zone_path): except Exception as e: logger.log_info("Fail to check thermal zone temperature for file {} due to {}".format(thermal_zone_path, repr(e))) + @classmethod + def check_module_temperature_trustable(cls): + if not cls.thermal_profile: + raise Exception("Fail to get thermal profile for this switch") + start, count = cls.thermal_profile[THERMAL_DEV_CATEGORY_MODULE] + for index in range(count): + fault_file_path = MODULE_TEMPERATURE_FAULT_PATH.format(index + start) + fault = cls._read_generic_file(fault_file_path, 0) + if fault.strip() != '0': + return 'untrust' + return 'trust' + @classmethod + def get_air_flow_direction(cls): + fan_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_FAN_AMBIENT) + port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT) + + # if there is any exception, let it raise + fan_ambient_temp = int(self._read_generic_file(fan_ambient_path)) + port_ambient_temp = int(self._read_generic_file(port_ambient_path)) + if fan_ambient_temp > port_ambient_temp: + return 'p2c', fan_ambient_temp + elif fan_ambient_temp < port_ambient_temp: + return 'c2p', port_ambient_temp + else: + return 'unk', fan_ambient_temp diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 86db8f203c88..335e3e2f3ea9 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -113,6 +113,7 @@ def execute(self, thermal_info_dict): """ from .thermal_infos import FanInfo from .thermal import Thermal + from .fan import Fan Thermal.set_thermal_algorithm_status(self.status, False) if self.status: # Check thermal zone temperature, if all thermal zone temperature @@ -123,4 +124,31 @@ def execute(self, thermal_info_dict): for fan in fan_info_obj.get_presence_fans(): if fan.get_target_speed() != 100: break - fan.set_speed(60) + fan.set_speed(Fan.min_cooling_level * 10) + + +class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): + UNKNOWN_SKU_COOLING_LEVEL = 6 + def execute(self, thermal_info_dict): + from .device_data import DEVICE_DATA + from .fan import Fan + from .thermal_infos import ChassisInfo + from .thermal_conditions import MinCoolingLevelChangeCondition + + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + if chassis.sku_name not in DEVICE_DATA or 'thermal' not in DEVICE_DATA[chassis.sku_name] or 'minimum_table' not in DEVICE_DATA[chassis.sku_name]['thermal']: + Fan.min_cooling_level = ChangeMinCoolingLevelAction.UNKNOWN_SKU_COOLING_LEVEL + return + + air_flow_dir = MinCoolingLevelChangeCondition.air_flow_dir + trust_state = MinCoolingLevelChangeCondition.trust_state + temperature = MinCoolingLevelChangeCondition.temperature + minimum_table = DEVICE_DATA[chassis.sku_name]['thermal']['minimum_table']['{}_{}'.format(air_flow_dir, trust_state)] + + for key, cooling_level in minimum_table.items(): + temp_range = key.split(':') + temp_min = int(temp_range[0]) * 1000 + temp_max = int(temp_range[1]) * 1000 + if temp_min <= temperature <= temp_max: + Fan.min_cooling_level = cooling_level - 10 + break diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index 941875b95499..c283f4e6489d 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -75,3 +75,29 @@ def is_match(self, thermal_info_dict): psu_info_obj = self.get_psu_info(thermal_info_dict) return len(psu_info_obj.get_absence_psus()) == 0 if psu_info_obj else False + +class MinCoolingLevelChangeCondition(ThermalPolicyConditionBase): + trust_state = None + air_flow_dir = None + temperature = None + + def is_match(self, thermal_info_dict): + from .thermal import Thermal + + trust_state = Thermal.check_module_temperature_trustable() + air_flow_dir, temperature = Thermal.get_air_flow_direction() + + change_cooling_level = False + if trust_state != self.trust_state: + self.trust_state = trust_state + change_cooling_level = True + + if air_flow_dir != self.air_flow_dir: + self.air_flow_dir = air_flow_dir + change_cooling_level = True + + if temperature != self.temperature: + self.temperature = temperature + change_cooling_level = True + + return change_cooling_level diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index 137e6d9da5b8..efcfb154717f 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -1,11 +1,21 @@ import os from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase +from sonic_platform_base.sonic_thermal_control.thermal_policy import ThermalPolicy from .thermal_actions import * from .thermal_conditions import * from .thermal_infos import * class ThermalManager(ThermalManagerBase): + @classmethod + def initialize(cls): + """ + Initialize thermal manager, including register thermal condition types and thermal action types + and any other vendor specific initialization. + :return: + """ + cls._add_private_thermal_policy() + @classmethod def start_thermal_control_algorithm(cls): """ @@ -28,4 +38,9 @@ def stop_thermal_control_algorithm(cls): from .thermal import Thermal Thermal.set_thermal_algorithm_status(False) - + @classmethod + def _add_private_thermal_policy(cls): + policy = ThermalPolicy() + policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition() + policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction() + cls._policy_dict['DynamicMinCoolingLevelPolicy'] = policy diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 128fa16de040..73342a75f335 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -426,4 +426,47 @@ class MockThermalManager(ThermalManagerBase): with pytest.raises(Exception): MockThermalManager.load(os.path.join(test_path, 'policy_with_same_conditions.json')) +def test_dynamic_minimum_table_data(): + from sonic_platform.device_data import DEVICE_DATA + for sku, sku_data in DEVICE_DATA.items(): + if 'thermal' in sku_data and 'minimum_table' in sku_data['thermal']: + minimum_table = sku_data['thermal']['minimum_table'] + check_minimum_table_data(sku, minimum_table) + +def check_minimum_table_data(sku, minimum_table): + valid_dir = ['p2c', 'c2p', 'unk'] + valid_trust_state = ['trust', 'untrust'] + + for category, data in minimum_table.items(): + key_data = category.split('_') + assert key_data[0] in valid_dir + assert key_data[1] in valid_trust_state + + data_list = [(value, key) for key, value in data.items()] + data_list.sort(key=lambda x : x[0]) + + previous_edge = None + previous_cooling_level = None + for item in data_list: + cooling_level = item[0] + range_str = item[1] + + ranges = range_str.split(':') + low = int(ranges[0]) + high = int(ranges[1]) + assert low < high + + if previous_edge is None: + assert low == -127 + else: + assert low - previous_edge == 1, '{}-{}-{} error, item={}'.format(sku, key_data[0], key_data[1], item) + previous_edge = high + + assert 10 <= cooling_level <= 20 + if previous_cooling_level is not None: + assert cooling_level > previous_cooling_level + previous_cooling_level = cooling_level + + + From 1bcec635f3e4652d3a9eb37e04e45431b1346afb Mon Sep 17 00:00:00 2001 From: junchao Date: Thu, 26 Mar 2020 20:13:01 +0800 Subject: [PATCH 18/25] Add unit test for DynamicMinCoolingLevelPolicy --- .../sonic_platform/thermal.py | 4 +- .../sonic_platform/thermal_conditions.py | 12 ++--- .../tests/test_thermal_policy.py | 46 ++++++++++++++++++- 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index 54976a66dde4..a9a83daccd7c 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -549,8 +549,8 @@ def get_air_flow_direction(cls): port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT) # if there is any exception, let it raise - fan_ambient_temp = int(self._read_generic_file(fan_ambient_path)) - port_ambient_temp = int(self._read_generic_file(port_ambient_path)) + fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path)) + port_ambient_temp = int(cls._read_generic_file(port_ambient_path)) if fan_ambient_temp > port_ambient_temp: return 'p2c', fan_ambient_temp elif fan_ambient_temp < port_ambient_temp: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index c283f4e6489d..8593b0f32881 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -88,16 +88,16 @@ def is_match(self, thermal_info_dict): air_flow_dir, temperature = Thermal.get_air_flow_direction() change_cooling_level = False - if trust_state != self.trust_state: - self.trust_state = trust_state + if trust_state != MinCoolingLevelChangeCondition.trust_state: + MinCoolingLevelChangeCondition.trust_state = trust_state change_cooling_level = True - if air_flow_dir != self.air_flow_dir: - self.air_flow_dir = air_flow_dir + if air_flow_dir != MinCoolingLevelChangeCondition.air_flow_dir: + MinCoolingLevelChangeCondition.air_flow_dir = air_flow_dir change_cooling_level = True - if temperature != self.temperature: - self.temperature = temperature + if temperature != MinCoolingLevelChangeCondition.temperature: + MinCoolingLevelChangeCondition.temperature = temperature change_cooling_level = True return change_cooling_level diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 73342a75f335..0672d63a5914 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -467,6 +467,48 @@ def check_minimum_table_data(sku, minimum_table): assert cooling_level > previous_cooling_level previous_cooling_level = cooling_level +def test_dynamic_minimum_policy(thermal_manager): + from sonic_platform.thermal_conditions import MinCoolingLevelChangeCondition + from sonic_platform.thermal_actions import ChangeMinCoolingLevelAction + from sonic_platform.thermal_infos import ChassisInfo + from sonic_platform.thermal import Thermal + from sonic_platform.fan import Fan + ThermalManager.initialize() + assert 'DynamicMinCoolingLevelPolicy' in thermal_manager._policy_dict + policy = thermal_manager._policy_dict['DynamicMinCoolingLevelPolicy'] + assert MinCoolingLevelChangeCondition in policy.conditions + assert ChangeMinCoolingLevelAction in policy.actions + + condition = policy.conditions[MinCoolingLevelChangeCondition] + action = policy.actions[ChangeMinCoolingLevelAction] + Thermal.check_module_temperature_trustable = MagicMock(return_value='trust') + Thermal.get_air_flow_direction = MagicMock(return_value=('p2c', 35000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'trust' + assert MinCoolingLevelChangeCondition.air_flow_dir == 'p2c' + assert MinCoolingLevelChangeCondition.temperature == 35000 + assert not condition.is_match(None) + + Thermal.check_module_temperature_trustable = MagicMock(return_value='untrust') + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.trust_state == 'untrust' + + Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 35000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.air_flow_dir == 'c2p' + + Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 25000)) + assert condition.is_match(None) + assert MinCoolingLevelChangeCondition.temperature == 25000 - - + chassis = MockChassis() + chassis.sku_name = 'invalid' + info = ChassisInfo() + info._chassis = chassis + thermal_info_dict = {ChassisInfo.INFO_NAME: info} + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 6 + + chassis.sku_name = 'ACS-MSN2700' + action.execute(thermal_info_dict) + assert Fan.min_cooling_level == 4 From c6ed366ec8320afb811b2f22c05c21ac3b7fdc10 Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 27 Mar 2020 09:54:37 +0800 Subject: [PATCH 19/25] If current cooling state below minimum cooling state, set it to minimum cooling state --- .../sonic_platform/thermal_actions.py | 31 ++++++++++--------- .../tests/test_thermal_policy.py | 5 +++ 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 335e3e2f3ea9..4bde98fcdc5b 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -138,17 +138,20 @@ def execute(self, thermal_info_dict): chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() if chassis.sku_name not in DEVICE_DATA or 'thermal' not in DEVICE_DATA[chassis.sku_name] or 'minimum_table' not in DEVICE_DATA[chassis.sku_name]['thermal']: Fan.min_cooling_level = ChangeMinCoolingLevelAction.UNKNOWN_SKU_COOLING_LEVEL - return - - air_flow_dir = MinCoolingLevelChangeCondition.air_flow_dir - trust_state = MinCoolingLevelChangeCondition.trust_state - temperature = MinCoolingLevelChangeCondition.temperature - minimum_table = DEVICE_DATA[chassis.sku_name]['thermal']['minimum_table']['{}_{}'.format(air_flow_dir, trust_state)] - - for key, cooling_level in minimum_table.items(): - temp_range = key.split(':') - temp_min = int(temp_range[0]) * 1000 - temp_max = int(temp_range[1]) * 1000 - if temp_min <= temperature <= temp_max: - Fan.min_cooling_level = cooling_level - 10 - break + else: + air_flow_dir = MinCoolingLevelChangeCondition.air_flow_dir + trust_state = MinCoolingLevelChangeCondition.trust_state + temperature = MinCoolingLevelChangeCondition.temperature + minimum_table = DEVICE_DATA[chassis.sku_name]['thermal']['minimum_table']['{}_{}'.format(air_flow_dir, trust_state)] + + for key, cooling_level in minimum_table.items(): + temp_range = key.split(':') + temp_min = int(temp_range[0]) * 1000 + temp_max = int(temp_range[1]) * 1000 + if temp_min <= temperature <= temp_max: + Fan.min_cooling_level = cooling_level - 10 + break + + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level < Fan.min_cooling_level: + Fan.set_cooling_level(Fan.min_cooling_level) diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index 0672d63a5914..a89cd75c6e3b 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -506,9 +506,14 @@ def test_dynamic_minimum_policy(thermal_manager): info = ChassisInfo() info._chassis = chassis thermal_info_dict = {ChassisInfo.INFO_NAME: info} + Fan.get_cooling_level = MagicMock(return_value=5) + Fan.set_cooling_level = MagicMock() action.execute(thermal_info_dict) assert Fan.min_cooling_level == 6 + Fan.set_cooling_level.assert_called_with(6) + Fan.set_cooling_level.call_count = 0 chassis.sku_name = 'ACS-MSN2700' action.execute(thermal_info_dict) assert Fan.min_cooling_level == 4 + assert Fan.set_cooling_level.call_count == 0 From 27fdea46aadd3546bea67040126934fda37260c3 Mon Sep 17 00:00:00 2001 From: junchao Date: Wed, 1 Apr 2020 19:29:38 +0800 Subject: [PATCH 20/25] Enable changing PSU fan speed --- .../mlnx-platform-api/sonic_platform/fan.py | 25 ++++++++++++++++--- .../sonic_platform/thermal.py | 4 +-- .../sonic_platform/thermal_actions.py | 16 ++++++++++-- .../sonic_platform/thermal_conditions.py | 14 +++++++++++ .../sonic_platform/thermal_manager.py | 13 +++++++--- .../tests/test_thermal_policy.py | 4 +-- 6 files changed, 63 insertions(+), 13 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index 14bb873ee8ef..e078557b840e 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -9,6 +9,7 @@ ############################################################################# import os.path +import subprocess try: from sonic_platform_base.fan_base import FanBase @@ -22,6 +23,7 @@ FAN_PATH = "/var/run/hw-management/thermal/" LED_PATH = "/var/run/hw-management/led/" +CONFIG_PATH = "/var/run/hw-management/config" # fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches FAN_DIR = "/var/run/hw-management/system/fan_dir" COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state" @@ -39,6 +41,9 @@ class Fan(FanBase): STATUS_LED_COLOR_ORANGE = "orange" min_cooling_level = 2 + # PSU fan speed vector + PSU_FAN_SPEED = ['0x3c', '0x3c', '0x3c', '0x3c', '0x3c', + '0x3c', '0x3c', '0x46', '0x50', '0x5a', '0x64'] def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None): # API index is starting from 0, Mellanox platform index is starting from 1 @@ -60,6 +65,10 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) self._name = 'psu_{}_fan_{}'.format(self.index, 1) self.fan_max_speed_path = None + self.psu_i2c_bus_path = join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) + self.psu_i2c_addr_path = join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) + self.psu_i2c_command_path = join(CONFIG_PATH, 'fan_command') + self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index) @@ -239,9 +248,19 @@ def set_speed(self, speed): status = True if self.is_psu_fan: - #PSU fan speed is not setable. - return False - + try: + with open(self.psu_i2c_bus_path, 'r') as f: + bus = f.read().strip() + with open(self.psu_i2c_addr_path, 'r') as f: + addr = f.read().strip() + with open(self.psu_i2c_command_path, 'r') as f: + command = f.read().strip() + speed = Fan.PSU_FAN_SPEED[int(speed / 10)] + subprocess.call("i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed), shell = True) + return True + except Exception as e: + return False + try: cooling_level = int(speed / 10) if cooling_level < self.min_cooling_level: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py index a9a83daccd7c..66ff58f7ae07 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py @@ -549,8 +549,8 @@ def get_air_flow_direction(cls): port_ambient_path = join(HW_MGMT_THERMAL_ROOT, THERMAL_DEV_PORT_AMBIENT) # if there is any exception, let it raise - fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path)) - port_ambient_temp = int(cls._read_generic_file(port_ambient_path)) + fan_ambient_temp = int(cls._read_generic_file(fan_ambient_path, 0)) + port_ambient_temp = int(cls._read_generic_file(port_ambient_path, 0)) if fan_ambient_temp > port_ambient_temp: return 'p2c', fan_ambient_temp elif fan_ambient_temp < port_ambient_temp: diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 4bde98fcdc5b..3a364d594057 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -146,8 +146,8 @@ def execute(self, thermal_info_dict): for key, cooling_level in minimum_table.items(): temp_range = key.split(':') - temp_min = int(temp_range[0]) * 1000 - temp_max = int(temp_range[1]) * 1000 + temp_min = int(temp_range[0]) + temp_max = int(temp_range[1]) if temp_min <= temperature <= temp_max: Fan.min_cooling_level = cooling_level - 10 break @@ -155,3 +155,15 @@ def execute(self, thermal_info_dict): current_cooling_level = Fan.get_cooling_level() if current_cooling_level < Fan.min_cooling_level: Fan.set_cooling_level(Fan.min_cooling_level) + + +class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): + def execute(self, thermal_info_dict): + from .thermal_conditions import CoolingLevelChangeCondition + from .thermal_infos import ChassisInfo + + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + cooling_level = CoolingLevelChangeCondition.cooling_level + for psu in chassis.get_all_psus(): + for psu_fan in psu.get_all_fans(): + psu_fan.set_speed(cooling_level * 10) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index 8593b0f32881..bc5a1bb68276 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -86,6 +86,7 @@ def is_match(self, thermal_info_dict): trust_state = Thermal.check_module_temperature_trustable() air_flow_dir, temperature = Thermal.get_air_flow_direction() + temperature = temperature / 1000 change_cooling_level = False if trust_state != MinCoolingLevelChangeCondition.trust_state: @@ -101,3 +102,16 @@ def is_match(self, thermal_info_dict): change_cooling_level = True return change_cooling_level + + +class CoolingLevelChangeCondition(ThermalPolicyConditionBase): + cooling_level = None + + def is_match(self, thermal_info_dict): + from .fan import Fan + current_cooling_level = Fan.get_cooling_level() + if current_cooling_level != cooling_level: + CoolingLevelChangeCondition.cooling_level = current_cooling_level + return True + else: + return False diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py index efcfb154717f..d70a5bcd1a03 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_manager.py @@ -40,7 +40,12 @@ def stop_thermal_control_algorithm(cls): @classmethod def _add_private_thermal_policy(cls): - policy = ThermalPolicy() - policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition() - policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction() - cls._policy_dict['DynamicMinCoolingLevelPolicy'] = policy + dynamic_min_speed_policy = ThermalPolicy() + dynamic_min_speed_policy.conditions[MinCoolingLevelChangeCondition] = MinCoolingLevelChangeCondition() + dynamic_min_speed_policy.actions[ChangeMinCoolingLevelAction] = ChangeMinCoolingLevelAction() + cls._policy_dict['DynamicMinCoolingLevelPolicy'] = dynamic_min_speed_policy + + update_psu_fan_speed_policy = ThermalPolicy() + update_psu_fan_speed_policy.conditions[CoolingLevelChangeCondition] = CoolingLevelChangeCondition() + update_psu_fan_speed_policy.actions[UpdatePsuFanSpeedAction] = UpdatePsuFanSpeedAction() + cls._policy_dict['UpdatePsuFanSpeedPolicy'] = update_psu_fan_speed_policy diff --git a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py index a89cd75c6e3b..789e316bb540 100644 --- a/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py +++ b/platform/mellanox/mlnx-platform-api/tests/test_thermal_policy.py @@ -486,7 +486,7 @@ def test_dynamic_minimum_policy(thermal_manager): assert condition.is_match(None) assert MinCoolingLevelChangeCondition.trust_state == 'trust' assert MinCoolingLevelChangeCondition.air_flow_dir == 'p2c' - assert MinCoolingLevelChangeCondition.temperature == 35000 + assert MinCoolingLevelChangeCondition.temperature == 35 assert not condition.is_match(None) Thermal.check_module_temperature_trustable = MagicMock(return_value='untrust') @@ -499,7 +499,7 @@ def test_dynamic_minimum_policy(thermal_manager): Thermal.get_air_flow_direction = MagicMock(return_value=('c2p', 25000)) assert condition.is_match(None) - assert MinCoolingLevelChangeCondition.temperature == 25000 + assert MinCoolingLevelChangeCondition.temperature == 25 chassis = MockChassis() chassis.sku_name = 'invalid' From f61a3636f468578c6daa7d139448751a1fa604e2 Mon Sep 17 00:00:00 2001 From: junchao Date: Wed, 1 Apr 2020 19:37:46 +0800 Subject: [PATCH 21/25] install i2c-tool in pmon docker --- dockers/docker-platform-monitor/Dockerfile.j2 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dockers/docker-platform-monitor/Dockerfile.j2 b/dockers/docker-platform-monitor/Dockerfile.j2 index fd11f628559c..1763eb9bac0b 100755 --- a/dockers/docker-platform-monitor/Dockerfile.j2 +++ b/dockers/docker-platform-monitor/Dockerfile.j2 @@ -18,7 +18,8 @@ RUN apt-get update && \ rrdtool \ python-smbus \ ethtool \ - dmidecode + dmidecode \ + i2c-tools {% if docker_platform_monitor_debs.strip() -%} # Copy locally-built Debian package dependencies From b285bc84b25b2a65e5df3565fbadd61631e6588a Mon Sep 17 00:00:00 2001 From: junchao Date: Thu, 2 Apr 2020 09:55:22 +0800 Subject: [PATCH 22/25] fix issue found in manual test --- platform/mellanox/mlnx-platform-api/sonic_platform/fan.py | 6 +++--- .../mlnx-platform-api/sonic_platform/thermal_conditions.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py index e078557b840e..15f5bdf5ed32 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/fan.py @@ -65,9 +65,9 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index) self._name = 'psu_{}_fan_{}'.format(self.index, 1) self.fan_max_speed_path = None - self.psu_i2c_bus_path = join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) - self.psu_i2c_addr_path = join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) - self.psu_i2c_command_path = join(CONFIG_PATH, 'fan_command') + self.psu_i2c_bus_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index)) + self.psu_i2c_addr_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index)) + self.psu_i2c_command_path = os.path.join(CONFIG_PATH, 'fan_command') self.fan_status_path = "fan{}_fault".format(self.index) self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py index bc5a1bb68276..5d8a5821ad42 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_conditions.py @@ -110,7 +110,7 @@ class CoolingLevelChangeCondition(ThermalPolicyConditionBase): def is_match(self, thermal_info_dict): from .fan import Fan current_cooling_level = Fan.get_cooling_level() - if current_cooling_level != cooling_level: + if current_cooling_level != CoolingLevelChangeCondition.cooling_level: CoolingLevelChangeCondition.cooling_level = current_cooling_level return True else: From de6ca46b9db7d6d7e5254579683a95a9c9195f2d Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 3 Apr 2020 14:56:43 +0800 Subject: [PATCH 23/25] Add logs to thermal actions --- .../mlnx-platform-api/sonic_platform/thermal_actions.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index 3a364d594057..eaed99eca9ed 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -1,5 +1,6 @@ from sonic_platform_base.sonic_thermal_control.thermal_action_base import ThermalPolicyActionBase from sonic_platform_base.sonic_thermal_control.thermal_json_object import thermal_json_object +from .thermal import logger class SetFanSpeedAction(ThermalPolicyActionBase): @@ -52,6 +53,7 @@ def execute(self, thermal_info_dict): fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] for fan in fan_info_obj.get_presence_fans(): fan.set_speed(self.speed) + logger.log_info('Set all system FAN speed to {}'.format(self.speed)) @thermal_json_object('fan.all.check_and_set_speed') @@ -126,6 +128,8 @@ def execute(self, thermal_info_dict): break fan.set_speed(Fan.min_cooling_level * 10) + logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) + class ChangeMinCoolingLevelAction(ThermalPolicyActionBase): UNKNOWN_SKU_COOLING_LEVEL = 6 @@ -156,6 +160,8 @@ def execute(self, thermal_info_dict): if current_cooling_level < Fan.min_cooling_level: Fan.set_cooling_level(Fan.min_cooling_level) + logger.log_info('Changed minimum cooling level to {}'.format(Fan.min_cooling_level)) + class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): def execute(self, thermal_info_dict): @@ -167,3 +173,5 @@ def execute(self, thermal_info_dict): for psu in chassis.get_all_psus(): for psu_fan in psu.get_all_fans(): psu_fan.set_speed(cooling_level * 10) + + logger.log_info('Updated PSU FAN speed to {}%'.format(cooling_level * 10)) From c16bb1d8e7275f53695835f28158bc0f3c70de9b Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 3 Apr 2020 16:45:44 +0800 Subject: [PATCH 24/25] Update PSU fan speed whenever system fan speed or cooling level changed --- .../sonic_platform/thermal_actions.py | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py index eaed99eca9ed..dbb2e2e54059 100644 --- a/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py +++ b/platform/mellanox/mlnx-platform-api/sonic_platform/thermal_actions.py @@ -55,6 +55,20 @@ def execute(self, thermal_info_dict): fan.set_speed(self.speed) logger.log_info('Set all system FAN speed to {}'.format(self.speed)) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, self.speed) + + @classmethod + def set_psu_fan_speed(cls, thermal_info_dict, speed): + from .thermal_infos import ChassisInfo + if ChassisInfo.INFO_NAME in thermal_info_dict and isinstance(thermal_info_dict[ChassisInfo.INFO_NAME], ChassisInfo): + chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() + for psu in chassis.get_all_psus(): + for psu_fan in psu.get_all_fans(): + psu_fan.set_speed(speed) + + logger.log_info('Updated PSU FAN speed to {}%'.format(speed)) + + @thermal_json_object('fan.all.check_and_set_speed') class CheckAndSetAllFanSpeedAction(SetAllFanSpeedAction): @@ -123,10 +137,16 @@ def execute(self, thermal_info_dict): # save power if Thermal.check_thermal_zone_temperature(): fan_info_obj = thermal_info_dict[FanInfo.INFO_NAME] + update_psu_fan_speed = False + speed = Fan.min_cooling_level * 10 for fan in fan_info_obj.get_presence_fans(): if fan.get_target_speed() != 100: break - fan.set_speed(Fan.min_cooling_level * 10) + update_psu_fan_speed = True + fan.set_speed(speed) + + if update_psu_fan_speed: + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, speed) logger.log_info('Changed thermal algorithm status to {}'.format(self.status)) @@ -159,6 +179,7 @@ def execute(self, thermal_info_dict): current_cooling_level = Fan.get_cooling_level() if current_cooling_level < Fan.min_cooling_level: Fan.set_cooling_level(Fan.min_cooling_level) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, Fan.min_cooling_level * 10) logger.log_info('Changed minimum cooling level to {}'.format(Fan.min_cooling_level)) @@ -166,12 +187,4 @@ def execute(self, thermal_info_dict): class UpdatePsuFanSpeedAction(ThermalPolicyActionBase): def execute(self, thermal_info_dict): from .thermal_conditions import CoolingLevelChangeCondition - from .thermal_infos import ChassisInfo - - chassis = thermal_info_dict[ChassisInfo.INFO_NAME].get_chassis() - cooling_level = CoolingLevelChangeCondition.cooling_level - for psu in chassis.get_all_psus(): - for psu_fan in psu.get_all_fans(): - psu_fan.set_speed(cooling_level * 10) - - logger.log_info('Updated PSU FAN speed to {}%'.format(cooling_level * 10)) + SetAllFanSpeedAction.set_psu_fan_speed(thermal_info_dict, CoolingLevelChangeCondition.cooling_level * 10) From ab9132fa3b59432cb91221259125596006160721 Mon Sep 17 00:00:00 2001 From: junchao Date: Fri, 3 Apr 2020 16:49:31 +0800 Subject: [PATCH 25/25] fix unit test failure --- platform/mellanox/mlnx-platform-api/tests/mock_platform.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py index ff8adb66ac91..c53480584889 100644 --- a/platform/mellanox/mlnx-platform-api/tests/mock_platform.py +++ b/platform/mellanox/mlnx-platform-api/tests/mock_platform.py @@ -28,6 +28,9 @@ def get_presence(self): def get_powergood_status(self): return self.powergood + def get_all_fans(self): + return [] + class MockChassis: def __init__(self):