Skip to content

Commit b8ea901

Browse files
Junchao-Mellanoxpphuchar
authored andcommitted
[Mellanox] Fix thermal control bugs (sonic-net#4298)
* [thermal control] Fix pmon docker stop issue on 3800 * [thermal fix] Fix QA test issue * [thermal fix] Change psu._get_power_available_status to psu.get_power_available_status * [thermal fix] Adjust log for PSU absence and power absence * [thermal fix] Add unit test for loading thermal policy file with duplicate conditions in different policies * [thermal] Fix fan.get_presence for non-removable SKU * [thermal fix] Fix issue: fan direction is based on drawer * Fix issue: when a fan is not present, do not read the fan direction from sysfs; return N/A directly * [thermal fix] Add unit test for get_direction for absent FAN * A non-removable (unpluggable) PSU has no FAN, so there is no need to add a FAN object for this PSU * Update submodules
1 parent 6466930 commit b8ea901

File tree

15 files changed

+506
-51
lines changed

15 files changed

+506
-51
lines changed

platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from sonic_daemon_base.daemon_base import Logger
1616
from os import listdir
1717
from os.path import isfile, join
18+
from glob import glob
1819
import sys
1920
import io
2021
import re
@@ -34,6 +35,10 @@
3435

3536
HWMGMT_SYSTEM_ROOT = '/var/run/hw-management/system/'
3637

38+
MST_DEVICE_NAME_PATTERN = '/dev/mst/mt[0-9]*_pciconf0'
39+
MST_DEVICE_RE_PATTERN = '/dev/mst/mt([0-9]*)_pciconf0'
40+
SPECTRUM1_CHIP_ID = '52100'
41+
3742
#reboot cause related definitions
3843
REBOOT_CAUSE_ROOT = HWMGMT_SYSTEM_ROOT
3944

@@ -93,11 +98,21 @@ def initialize_fan(self):
9398
num_of_fan, num_of_drawer = self._extract_num_of_fans_and_fan_drawers()
9499
multi_rotor_in_drawer = num_of_fan > num_of_drawer
95100

101+
# Fan's direction isn't supported on spectrum 1 devices for now
102+
mst_dev_list = glob(MST_DEVICE_NAME_PATTERN)
103+
if not mst_dev_list:
104+
raise RuntimeError("Can't get chip type due to {} not found".format(MST_DEVICE_NAME_PATTERN))
105+
m = re.search(MST_DEVICE_RE_PATTERN, mst_dev_list[0])
106+
if m.group(1) == SPECTRUM1_CHIP_ID:
107+
has_fan_dir = False
108+
else:
109+
has_fan_dir = True
110+
96111
for index in range(num_of_fan):
97112
if multi_rotor_in_drawer:
98-
fan = Fan(index, index/2)
113+
fan = Fan(has_fan_dir, index, index/2, False, self.sku_name)
99114
else:
100-
fan = Fan(index, index)
115+
fan = Fan(has_fan_dir, index, index, False, self.sku_name)
101116
self._fan_list.append(fan)
102117

103118

platform/mellanox/mlnx-platform-api/sonic_platform/fan.py

Lines changed: 131 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -15,23 +15,33 @@
1515
except ImportError as e:
1616
raise ImportError (str(e) + "- required module not found")
1717

18-
LED_ON = 1
19-
LED_OFF = 0
18+
LED_ON = '1'
19+
LED_OFF = '0'
2020

2121
PWM_MAX = 255
2222

2323
FAN_PATH = "/var/run/hw-management/thermal/"
2424
LED_PATH = "/var/run/hw-management/led/"
25+
# fan_dir isn't supported on Spectrum 1. It is supported on Spectrum 2 and later switches
26+
FAN_DIR = "/var/run/hw-management/system/fan_dir"
27+
28+
# SKUs with non-removable (unpluggable) FANs:
29+
# 1. have no fanX_status, so the fan should be treated as always present
30+
hwsku_dict_with_unplugable_fan = ['ACS-MSN2010', 'ACS-MSN2100']
2531

2632
class Fan(FanBase):
2733
"""Platform-specific Fan class"""
28-
def __init__(self, fan_index, drawer_index = 1, psu_fan = False):
34+
35+
STATUS_LED_COLOR_ORANGE = "orange"
36+
37+
def __init__(self, has_fan_dir, fan_index, drawer_index = 1, psu_fan = False, sku = None):
2938
# API index is starting from 0, Mellanox platform index is starting from 1
3039
self.index = fan_index + 1
3140
self.drawer_index = drawer_index + 1
3241

3342
self.is_psu_fan = psu_fan
34-
43+
self.always_presence = False if sku not in hwsku_dict_with_unplugable_fan else True
44+
3545
self.fan_min_speed_path = "fan{}_min".format(self.index)
3646
if not self.is_psu_fan:
3747
self.fan_speed_get_path = "fan{}_speed_get".format(self.index)
@@ -42,14 +52,53 @@ def __init__(self, fan_index, drawer_index = 1, psu_fan = False):
4252
else:
4353
self.fan_speed_get_path = "psu{}_fan1_speed_get".format(self.index)
4454
self.fan_presence_path = "psu{}_fan1_speed_get".format(self.index)
45-
self._name = 'psu_{}_fan_{}'.format(self.index, fan_index)
55+
self._name = 'psu_{}_fan_{}'.format(self.index, 1)
4656
self.fan_max_speed_path = None
4757
self.fan_status_path = "fan{}_fault".format(self.index)
4858
self.fan_green_led_path = "led_fan{}_green".format(self.drawer_index)
4959
self.fan_red_led_path = "led_fan{}_red".format(self.drawer_index)
5060
self.fan_orange_led_path = "led_fan{}_orange".format(self.drawer_index)
5161
self.fan_pwm_path = "pwm1"
5262
self.fan_led_cap_path = "led_fan{}_capability".format(self.drawer_index)
63+
if has_fan_dir:
64+
self.fan_dir = FAN_DIR
65+
else:
66+
self.fan_dir = None
67+
68+
69+
def get_direction(self):
    """
    Retrieves the fan's direction

    Returns:
        A string, either FAN_DIRECTION_INTAKE or FAN_DIRECTION_EXHAUST
        depending on fan direction. FAN_DIRECTION_NOT_APPLICABLE is
        returned when no fan_dir file is configured for this fan, when
        this is a PSU fan, or when the fan is not present.

    Raises:
        RuntimeError: if the fan_dir file cannot be opened or its content
        cannot be parsed as an integer.

    Notes:
        What Mellanox calls forward:
        Air flows from fans side to QSFP side, for example: MSN2700-CS2F
        which means intake in community
        What Mellanox calls reverse:
        Air flow from QSFP side to fans side, for example: MSN2700-CS2R
        which means exhaust in community
        According to hw-mgmt:
            1 stands for forward, in other words intake
            0 stands for reverse, in other words exhaust
    """
    # Nothing to read for these cases; an absent fan must not touch sysfs.
    if not self.fan_dir or self.is_psu_fan or not self.get_presence():
        return self.FAN_DIRECTION_NOT_APPLICABLE

    try:
        with open(self.fan_dir, 'r') as fan_dir:
            fan_dir_bits = int(fan_dir.read())
        # The file holds a bitmask indexed by drawer; drawer_index is
        # 1-based, so drawer N is bit N-1.
        fan_mask = 1 << (self.drawer_index - 1)
        if fan_dir_bits & fan_mask:
            return self.FAN_DIRECTION_INTAKE
        return self.FAN_DIRECTION_EXHAUST
    except (ValueError, IOError) as e:
        raise RuntimeError("Failed to read fan direction status due to {}".format(repr(e)))
101+
53102

54103
def get_name(self):
55104
return self._name
@@ -63,15 +112,16 @@ def get_status(self):
63112
"""
64113
status = 0
65114
if self.is_psu_fan:
66-
status = 1
115+
status = 0
67116
else:
68117
try:
69118
with open(os.path.join(FAN_PATH, self.fan_status_path), 'r') as fault_status:
70119
status = int(fault_status.read())
71120
except (ValueError, IOError):
72-
status = 0
121+
status = 1
122+
123+
return status == 0
73124

74-
return status == 1
75125

76126
def get_presence(self):
77127
"""
@@ -87,14 +137,18 @@ def get_presence(self):
87137
else:
88138
status = 0
89139
else:
90-
try:
91-
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
92-
status = int(presence_status.read())
93-
except (ValueError, IOError):
94-
status = 0
140+
if self.always_presence:
141+
status = 1
142+
else:
143+
try:
144+
with open(os.path.join(FAN_PATH, self.fan_presence_path), 'r') as presence_status:
145+
status = int(presence_status.read())
146+
except (ValueError, IOError):
147+
status = 0
95148

96149
return status == 1
97-
150+
151+
98152
def _get_min_speed_in_rpm(self):
99153
speed = 0
100154
try:
@@ -104,7 +158,8 @@ def _get_min_speed_in_rpm(self):
104158
speed = 0
105159

106160
return speed
107-
161+
162+
108163
def _get_max_speed_in_rpm(self):
109164
speed = 0
110165
try:
@@ -115,6 +170,7 @@ def _get_max_speed_in_rpm(self):
115170

116171
return speed
117172

173+
118174
def get_speed(self):
119175
"""
120176
Retrieves the speed of fan
@@ -135,9 +191,12 @@ def get_speed(self):
135191

136192
max_speed_in_rpm = self._get_max_speed_in_rpm()
137193
speed = 100*speed_in_rpm/max_speed_in_rpm
194+
if speed > 100:
195+
speed = 100
138196

139197
return speed
140198

199+
141200
def get_target_speed(self):
142201
"""
143202
Retrieves the expected speed of fan
@@ -159,6 +218,7 @@ def get_target_speed(self):
159218

160219
return speed
161220

221+
162222
def set_speed(self, speed):
163223
"""
164224
Set fan speed to expected value
@@ -184,7 +244,8 @@ def set_speed(self, speed):
184244
status = False
185245

186246
return status
187-
247+
248+
188249
def _get_led_capability(self):
189250
cap_list = None
190251
try:
@@ -196,6 +257,7 @@ def _get_led_capability(self):
196257

197258
return cap_list
198259

260+
199261
def set_status_led(self, color):
200262
"""
201263
Set led to expected color
@@ -216,32 +278,70 @@ def set_status_led(self, color):
216278
return False
217279
status = False
218280
try:
219-
if color == 'green':
281+
if color == self.STATUS_LED_COLOR_GREEN:
220282
with open(os.path.join(LED_PATH, self.fan_green_led_path), 'w') as fan_led:
221-
fan_led.write(str(LED_ON))
222-
elif color == 'red':
283+
fan_led.write(LED_ON)
284+
status = True
285+
elif color == self.STATUS_LED_COLOR_RED:
223286
# Some fan don't support red led but support orange led, in this case we set led to orange
224-
if 'red' in led_cap_list:
287+
if self.STATUS_LED_COLOR_RED in led_cap_list:
225288
led_path = os.path.join(LED_PATH, self.fan_red_led_path)
226-
elif 'orange' in led_cap_list:
289+
elif self.STATUS_LED_COLOR_ORANGE in led_cap_list:
227290
led_path = os.path.join(LED_PATH, self.fan_orange_led_path)
228291
else:
229292
return False
230293
with open(led_path, 'w') as fan_led:
231-
fan_led.write(str(LED_ON))
232-
233-
elif color == 'off':
234-
with open(os.path.join(LED_PATH, self.fan_green_led_path), 'w') as fan_led:
235-
fan_led.write(str(LED_OFF))
236-
237-
with open(os.path.join(LED_PATH, self.fan_red_led_path), 'w') as fan_led:
238-
fan_led.write(str(LED_OFF))
294+
fan_led.write(LED_ON)
295+
status = True
296+
elif color == self.STATUS_LED_COLOR_OFF:
297+
if self.STATUS_LED_COLOR_GREEN in led_cap_list:
298+
with open(os.path.join(LED_PATH, self.fan_green_led_path), 'w') as fan_led:
299+
fan_led.write(str(LED_OFF))
300+
if self.STATUS_LED_COLOR_RED in led_cap_list:
301+
with open(os.path.join(LED_PATH, self.fan_red_led_path), 'w') as fan_led:
302+
fan_led.write(str(LED_OFF))
303+
if self.STATUS_LED_COLOR_ORANGE in led_cap_list:
304+
with open(os.path.join(LED_PATH, self.fan_orange_led_path), 'w') as fan_led:
305+
fan_led.write(str(LED_OFF))
306+
307+
status = True
239308
else:
240309
status = False
241310
except (ValueError, IOError):
242-
status = False
311+
status = False
312+
243313
return status
244314

315+
316+
def get_status_led(self):
    """
    Gets the state of the fan status LED

    Returns:
        A string, one of the predefined STATUS_LED_COLOR_* strings.
        STATUS_LED_COLOR_OFF is returned when the LED capability list is
        unavailable or when no probed LED reads as lit.

    Raises:
        RuntimeError: if one of the LED files cannot be read.
    """
    led_cap_list = self._get_led_capability()
    if led_cap_list is None:
        return self.STATUS_LED_COLOR_OFF

    # Probe order matters: green first, then red, then orange.
    # The green LED is always probed; red/orange only when the capability
    # list advertises them.
    led_candidates = [(self.fan_green_led_path, self.STATUS_LED_COLOR_GREEN)]
    if self.STATUS_LED_COLOR_RED in led_cap_list:
        led_candidates.append((self.fan_red_led_path, self.STATUS_LED_COLOR_RED))
    if self.STATUS_LED_COLOR_ORANGE in led_cap_list:
        # set_status_led lights the orange LED when red is requested on a
        # fan without a red LED, so a lit orange LED is reported as red.
        led_candidates.append((self.fan_orange_led_path, self.STATUS_LED_COLOR_RED))

    try:
        for led_file, led_color in led_candidates:
            with open(os.path.join(LED_PATH, led_file), 'r') as fan_led:
                # Any content other than LED_OFF counts as lit.
                if LED_OFF != fan_led.read().rstrip('\n'):
                    return led_color
    except (ValueError, IOError) as e:
        raise RuntimeError("Failed to read led status for fan {} due to {}".format(self.index, repr(e)))

    return self.STATUS_LED_COLOR_OFF
343+
344+
245345
def get_speed_tolerance(self):
246346
"""
247347
Retrieves the speed tolerance of the fan
@@ -251,4 +351,4 @@ def get_speed_tolerance(self):
251351
considered tolerable
252352
"""
253353
# The tolerance value is fixed as 20% for all the Mellanox platform
254-
return 20
354+
return 20

0 commit comments

Comments
 (0)