Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mellanox] Adjust thermal control policies #8

Closed
wants to merge 26 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
45f1229
[thermal control] Fix pmon docker stop issue on 3800
Junchao-Mellanox Feb 18, 2020
23bf172
[thermal fix] Fix QA test issue
Junchao-Mellanox Feb 21, 2020
a1aaa93
Fix thermal control issues
stephenxs Dec 4, 2019
7f341f9
[thermal fix] change psu._get_power_available_status to psu.get_power…
Junchao-Mellanox Feb 25, 2020
3206e95
[thermal fix] adjust log for PSU absence and power absence
Junchao-Mellanox Feb 28, 2020
4e689ac
[thermal fix] add unit test for loading thermal policy file with dupl…
Junchao-Mellanox Feb 28, 2020
0b45c45
[thermal] fix fan.get_presence for non-removable SKU
Junchao-Mellanox Mar 10, 2020
bdfc652
[thermal fix] fix issue: fan direction is based on drawer
Junchao-Mellanox Mar 10, 2020
80e0b88
Fix issue: when fan is not present, should not read fan direction fro…
Junchao-Mellanox Mar 10, 2020
5b46d6f
[thermal fix] add unit test for get_direction for absent FAN
Junchao-Mellanox Mar 11, 2020
2d42cd3
Unpluggable PSU has no FAN, no need to add a FAN object for this PSU
Junchao-Mellanox Mar 16, 2020
6f80098
1. Enable thermal algorithm by default; 2. set cooling level before s…
Junchao-Mellanox Mar 18, 2020
d891200
start thermal algorithm should also check thermal zone temperature
Junchao-Mellanox Mar 18, 2020
71c3665
Should write string to file
Junchao-Mellanox Mar 19, 2020
f409195
We should force enable or disable thermal algo when thermal control d…
Junchao-Mellanox Mar 19, 2020
12a9d8f
Change thermal algorithm status should also change thermal zone policy
Junchao-Mellanox Mar 19, 2020
527679a
Merge remote-tracking branch 'origin/master' into thermal-algo-fix
Junchao-Mellanox Mar 26, 2020
766aff0
Add fan speed dynamic minimum value
Junchao-Mellanox Mar 26, 2020
1bcec63
Add unit test for DynamicMinCoolingLevelPolicy
Junchao-Mellanox Mar 26, 2020
c6ed366
If current cooling state below minimum cooling state, set it to minim…
Junchao-Mellanox Mar 27, 2020
27fdea4
Enable changing PSU fan speed
Junchao-Mellanox Apr 1, 2020
f61a363
install i2c-tool in pmon docker
Junchao-Mellanox Apr 1, 2020
b285bc8
fix issue found in manual test
Junchao-Mellanox Apr 2, 2020
de6ca46
Add logs to thermal actions
Junchao-Mellanox Apr 3, 2020
c16bb1d
Update PSU fan speed whenever system fan speed or cooling level changed
Junchao-Mellanox Apr 3, 2020
ab9132f
fix unit test failure
Junchao-Mellanox Apr 3, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions device/mellanox/x86_64-mlnx_msn2700-r0/thermal_policy.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"thermal_control_algorithm": {
"run_at_boot_up": "false",
"run_at_boot_up": "true",
"fan_speed_when_suspend": "60"
},
"info_types": [
Expand Down Expand Up @@ -51,6 +51,24 @@
}
]
},
{
"name": "any fan broken",
"conditions": [
{
"type": "fan.any.fault"
}
],
"actions": [
{
"type": "thermal_control.control",
"status": "false"
},
{
"type": "fan.all.set_speed",
"speed": "100"
}
]
},
{
"name": "all fan and psu presence",
"conditions": [
Expand All @@ -59,12 +77,15 @@
},
{
"type": "psu.all.presence"
},
{
"type": "fan.all.good"
}
],
"actions": [
{
"type": "fan.all.set_speed",
"speed": "60"
"type": "thermal_control.control",
"status": "true"
}
]
}
Expand Down
3 changes: 2 additions & 1 deletion dockers/docker-platform-monitor/Dockerfile.j2
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ RUN apt-get update && \
rrdtool \
python-smbus \
ethtool \
dmidecode
dmidecode \
i2c-tools

{% if docker_platform_monitor_debs.strip() -%}
# Copy locally-built Debian package dependencies
Expand Down
134 changes: 134 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# Per-SKU static platform data, keyed by hardware SKU name.
#
# Every SKU entry has the shape {'thermal': {'minimum_table': {...}}}.
# Each minimum table has six keys ("p2c"/"c2p"/"unk" combined with
# "trust"/"untrust"), each mapping "low:high" range strings to an integer.
# NOTE(review): presumably the ranges are ambient temperatures and the integers
# are minimum cooling levels, with the key prefix being the fan direction --
# confirm against the code that consumes this table.


def _sku_thermal(minimum_table):
    """Wrap a minimum table in the per-SKU {'thermal': {...}} structure."""
    return {'thermal': {'minimum_table': minimum_table}}


def _sn2700_family_minimum_table():
    """Return a fresh copy of the table shared by the SN2700/SN2410 SKUs."""
    return {
        "p2c_trust": {"-127:40": 13, "41:120": 15},
        "p2c_untrust": {"-127:25": 13, "26:30": 14, "31:35": 15, "36:120": 16},
        "c2p_trust": {"-127:20": 13, "21:25": 14, "26:30": 15, "31:120": 16},
        "c2p_untrust": {"-127:20": 13, "21:25": 14, "26:30": 15, "31:120": 16},
        "unk_trust": {"-127:20": 13, "21:25": 14, "26:30": 15, "31:120": 16},
        "unk_untrust": {"-127:20": 13, "21:25": 14, "26:30": 15, "31:120": 16},
    }


def _sn3800_family_minimum_table():
    """Return a fresh copy of the table shared by the SN3800 SKUs."""
    return {
        "p2c_trust": {"-127:35": 12, "36:120": 13},
        "p2c_untrust": {"-127:0": 12, "1:10": 13, "11:15": 14, "16:20": 15, "21:35": 16, "36:120": 17},
        "c2p_trust": {"-127:30": 12, "31:40": 13, "41:120": 14},
        "c2p_untrust": {"-127:20": 12, "21:30": 13, "31:35": 14, "36:40": 15, "41:120": 16},
        "unk_trust": {"-127:30": 12, "31:40": 13, "41:120": 14},
        "unk_untrust": {"-127:0": 12, "1:10": 13, "11:15": 14, "16:20": 15, "21:35": 16, "36:120": 17},
    }


# The factories are called once per SKU so that no two SKUs share dict objects.
DEVICE_DATA = {
    'ACS-MSN2700': _sku_thermal(_sn2700_family_minimum_table()),
    'LS-SN2700': _sku_thermal(_sn2700_family_minimum_table()),
    'ACS-MSN2740': _sku_thermal({
        "p2c_trust": {"-127:120": 13},
        "p2c_untrust": {"-127:35": 13, "36:40": 14, "41:120": 15},
        "c2p_trust": {"-127:120": 13},
        "c2p_untrust": {"-127:15": 13, "16:30": 14, "31:35": 15, "36:120": 17},
        "unk_trust": {"-127:120": 13},
        "unk_untrust": {"-127:15": 13, "16:30": 14, "31:35": 15, "36:120": 17},
    }),
    'ACS-MSN2410': _sku_thermal(_sn2700_family_minimum_table()),
    'Mellanox-SN2700': _sku_thermal(_sn2700_family_minimum_table()),
    'Mellanox-SN2700-D48C8': _sku_thermal(_sn2700_family_minimum_table()),
    'ACS-MSN2100': _sku_thermal({
        "p2c_trust": {"-127:120": 12},
        "p2c_untrust": {"-127:15": 12, "16:25": 13, "26:30": 14, "31:35": 15, "36:120": 16},
        "c2p_trust": {"-127:40": 12, "41:120": 13},
        "c2p_untrust": {"-127:40": 12, "41:120": 13},
        "unk_trust": {"-127:40": 12, "41:120": 13},
        "unk_untrust": {"-127:15": 12, "16:25": 13, "26:30": 14, "31:35": 15, "36:120": 16},
    }),
    'ACS-MSN2010': _sku_thermal({
        "p2c_trust": {"-127:120": 12},
        "p2c_untrust": {"-127:15": 12, "16:20": 13, "21:30": 14, "31:35": 15, "36:120": 16},
        "c2p_trust": {"-127:120": 12},
        "c2p_untrust": {"-127:20": 12, "21:25": 13, "26:30": 14, "31:35": 15, "36:120": 16},
        "unk_trust": {"-127:120": 12},
        "unk_untrust": {"-127:15": 12, "16:20": 13, "21:30": 14, "31:35": 15, "36:120": 16},
    }),
    'ACS-MSN3700': _sku_thermal({
        "p2c_trust": {"-127:25": 12, "26:40": 13, "41:120": 14},
        "p2c_untrust": {"-127:15": 12, "16:30": 13, "31:35": 14, "36:40": 15, "41:120": 16},
        "c2p_trust": {"-127:25": 12, "26:40": 13, "41:120": 14},
        "c2p_untrust": {"-127:25": 12, "26:40": 13, "41:120": 14},
        "unk_trust": {"-127:25": 12, "26:40": 13, "41:120": 14},
        "unk_untrust": {"-127:15": 12, "16:30": 13, "31:35": 14, "36:40": 15, "41:120": 16},
    }),
    'ACS-MSN3800': _sku_thermal(_sn3800_family_minimum_table()),
    'Mellanox-SN3800-D112C8': _sku_thermal(_sn3800_family_minimum_table()),
}
71 changes: 67 additions & 4 deletions platform/mellanox/mlnx-platform-api/sonic_platform/fan.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#############################################################################

import os.path
import subprocess

try:
from sonic_platform_base.fan_base import FanBase
Expand All @@ -22,8 +23,14 @@

# Locations under /var/run/hw-management used by this module.
# FAN_PATH is the base directory that per-fan file names are joined onto.
FAN_PATH = "/var/run/hw-management/thermal/"
LED_PATH = "/var/run/hw-management/led/"
# CONFIG_PATH is the base directory for the PSU i2c bus/address files and the fan command file.
CONFIG_PATH = "/var/run/hw-management/config"
# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches
FAN_DIR = "/var/run/hw-management/system/fan_dir"
# File that set_cooling_level() writes and get_cooling_level() reads.
COOLING_STATE_PATH = "/var/run/hw-management/thermal/cooling_cur_state"

# SKUs with unplugable FANs:
# 1. don't have fanX_status and should be treated as always present
# NOTE: despite the "dict" in its name, this is a list of SKU names.
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100']

# SKUs with unplugable FANs:
# 1. don't have fanX_status and should be treated as always present
Expand All @@ -33,6 +40,10 @@ class Fan(FanBase):
"""Platform-specific Fan class"""

STATUS_LED_COLOR_ORANGE = "orange"
min_cooling_level = 2
# PSU fan speed vector
PSU_FAN_SPEED = ['0x3c', '0x3c', '0x3c', '0x3c', '0x3c',
'0x3c', '0x3c', '0x46', '0x50', '0x5a', '0x64']

def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None):
# API index is starting from 0, Mellanox platform index is starting from 1
Expand All @@ -54,6 +65,10 @@ def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sk
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
self._name = 'psu_{}_fan_{}'.format(self.index, 1)
self.fan_max_speed_path = None
self.psu_i2c_bus_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_bus'.format(self.index))
self.psu_i2c_addr_path = os.path.join(CONFIG_PATH, 'psu{0}_i2c_addr'.format(self.index))
self.psu_i2c_command_path = os.path.join(CONFIG_PATH, 'fan_command')

self.fan_status_path = "fan{}_fault".format(self.index)
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index)
Expand Down Expand Up @@ -231,13 +246,28 @@ def set_speed(self, speed):
bool: True if set success, False if fail.
"""
status = True
pwm = int(round(PWM_MAX*speed/100.0))

if self.is_psu_fan:
#PSU fan speed is not setable.
return False

try:
with open(self.psu_i2c_bus_path, 'r') as f:
bus = f.read().strip()
with open(self.psu_i2c_addr_path, 'r') as f:
addr = f.read().strip()
with open(self.psu_i2c_command_path, 'r') as f:
command = f.read().strip()
speed = Fan.PSU_FAN_SPEED[int(speed / 10)]
subprocess.call("i2cset -f -y {0} {1} {2} {3} wp".format(bus, addr, command, speed), shell = True)
return True
except Exception as e:
return False

try:
cooling_level = int(speed / 10)
if cooling_level < self.min_cooling_level:
cooling_level = self.min_cooling_level
speed = self.min_cooling_level * 10
self.set_cooling_level(cooling_level)
pwm = int(round(PWM_MAX*speed/100.0))
with open(os.path.join(FAN_PATH, self.fan_speed_set_path), 'w') as fan_pwm:
fan_pwm.write(str(pwm))
except (ValueError, IOError):
Expand Down Expand Up @@ -352,3 +382,36 @@ def get_speed_tolerance(self):
"""
# The tolerance value is fixed as 20% for all the Mellanox platform
return 20

@classmethod
def set_cooling_level(cls, level):
    """
    Write a new cooling level to the cooling state file.

    Args:
        level: An integer in the range [1, 10]; 1 means 10%, 2 means 20%,
            ..., 10 means 100%.

    Raises:
        RuntimeError: If level is not an integer, is outside [1, 10], or
            the cooling state file cannot be written.
    """
    if not isinstance(level, int):
        raise RuntimeError("Failed to set cooling level, input parameter must be integer")

    if not 1 <= level <= 10:
        raise RuntimeError("Failed to set cooling level, level value must be in range [1, 10], got {}".format(level))

    # Two separate writes, in this order: first (level + 10) to reset the FAN
    # driver and change the cooling state, then the plain level so that the
    # file displays the correct value.
    try:
        for value in (level + 10, level):
            with open(COOLING_STATE_PATH, 'w') as state_file:
                state_file.write(str(value))
    except (ValueError, IOError) as err:
        raise RuntimeError("Failed to set cooling level - {}".format(err))

@classmethod
def get_cooling_level(cls):
    """
    Read the current cooling level from the cooling state file.

    Returns:
        The content of the cooling state file converted to an integer.

    Raises:
        RuntimeError: If the file cannot be read or its content is not a
            valid integer.
    """
    try:
        with open(COOLING_STATE_PATH, 'r') as state_file:
            raw_value = state_file.read()
        # int() tolerates surrounding whitespace such as a trailing newline.
        return int(raw_value)
    except (ValueError, IOError) as err:
        raise RuntimeError("Failed to get cooling level - {}".format(err))

Loading