Skip to content

Commit 214541b

Browse files
santhosh-kt authored and
abdosi committed
[DellEMC] S6000 - Thermal support - Last Reboot Reason (#4097)
- Added support for reporting a thermal event as the last reboot reason in the "show reboot-cause" command. - Added support for sending a log message in case of a thermal shutdown, for example: sonic NOTICE root: Shutting down due to over temperature (40 degree, 30 degree, 34 degree)
1 parent c34dcbe commit 214541b

File tree

2 files changed

+143
-45
lines changed

2 files changed

+143
-45
lines changed

platform/broadcom/sonic-platform-modules-dell/s6000/scripts/fancontrol.sh

Lines changed: 109 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/bin/bash
2+
trap 'cleanup' 0 1 2 3 6 9 10 11 13 15
23

34
LEVEL=99
45
INTERVAL=5
@@ -14,11 +15,27 @@ LEVEL3=16000
1415
LEVEL4=19000
1516
LEVEL5=19000
1617

18+
LRR_FILE="/host/reboot-cause/reboot-cause.txt"
1719
I2C_ADAPTER="/sys/class/i2c-adapter/i2c-2/i2c-11"
20+
1821
SENSOR1="$I2C_ADAPTER/11-004c/hwmon/hwmon*/temp1_input"
1922
SENSOR2="$I2C_ADAPTER/11-004d/hwmon/hwmon*/temp1_input"
2023
SENSOR3="$I2C_ADAPTER/11-004e/hwmon/hwmon*/temp1_input"
2124

25+
SENSOR1_MAX="$I2C_ADAPTER/11-004c/hwmon/hwmon*/temp1_max"
26+
SENSOR2_MAX="$I2C_ADAPTER/11-004d/hwmon/hwmon*/temp1_max"
27+
SENSOR3_MAX="$I2C_ADAPTER/11-004e/hwmon/hwmon*/temp1_max"
28+
29+
SENSOR1_MAX_VAL=$(cat $SENSOR1_MAX)
30+
SENSOR2_MAX_VAL=$(cat $SENSOR2_MAX)
31+
SENSOR3_MAX_VAL=$(cat $SENSOR3_MAX)
32+
33+
# Reduce by 63 to differentiate these temperature settings
34+
# from pmon sensors configuration settings
35+
SENSOR1_NEW_MAX=$(expr `echo $SENSOR1_MAX_VAL` + 5000 - 63)
36+
SENSOR2_NEW_MAX=$(expr `echo $SENSOR2_MAX_VAL` + 5000 - 63)
37+
SENSOR3_NEW_MAX=$(expr `echo $SENSOR3_MAX_VAL` + 5000 - 63)
38+
2239
# Three fan trays, each containing two separate fans
2340
# fan1-fan4 fan2-fan5 fan3-fan6
2441
FANTRAY1_FAN1=$I2C_ADAPTER/11-0029/fan1_target
@@ -46,6 +63,14 @@ function check_module
4663
fi
4764
}
4865

66+
cleanup()
{
    # Exit/signal handler (installed through `trap 'cleanup' ...` at the top
    # of the script): write the saved SENSORn_MAX_VAL values back into the
    # corresponding temp1_max attributes, then exit with status 1.
    local idx saved_val_var max_path_var

    for idx in 1 2 3
    do
        saved_val_var="SENSOR${idx}_MAX_VAL"
        max_path_var="SENSOR${idx}_MAX"
        # Left unquoted on purpose: the SENSORn_MAX paths contain a
        # "hwmon*" glob that the shell has to expand.
        echo ${!saved_val_var} > ${!max_path_var}
    done

    exit 1
}
73+
4974
function check_faulty_fan
5075
{
5176

@@ -123,56 +148,95 @@ function update_fan_speed
123148

124149
function monitor_temp_sensors
{
    # One monitoring pass over the three temperature sensors:
    #   1. Re-sync the raised temp1_max limits if the value currently in
    #      sysfs is not the one this script last wrote.
    #   2. If any sensor reading has reached its saved original limit, record
    #      a thermal reboot cause, log it and shut down.
    #   3. Otherwise pick a fan speed level from the average temperature.
    #   4. Check for faulty fans.
    #
    # Reads/updates the globals SENSORn_MAX_VAL, SENSORn_NEW_MAX,
    # SENSORn_CUR_MAX_VAL, s1..s3, sensor1..sensor3, sum, sensor_temp, LEVEL.
    # The SENSORn / SENSORn_MAX paths are deliberately left unquoted because
    # they contain a "hwmon*" glob that the shell must expand.

    SENSOR1_CUR_MAX_VAL=$(cat $SENSOR1_MAX)
    SENSOR2_CUR_MAX_VAL=$(cat $SENSOR2_MAX)
    SENSOR3_CUR_MAX_VAL=$(cat $SENSOR3_MAX)

    # If the limit in sysfs differs from the raised value we last computed,
    # treat the current value as the new baseline, remember it, and write
    # baseline + 5000 - 63 back.
    if [ "$SENSOR1_CUR_MAX_VAL" -ne "$SENSOR1_NEW_MAX" ]
    then
        SENSOR1_NEW_MAX=$((SENSOR1_CUR_MAX_VAL + 5000 - 63))
        SENSOR1_MAX_VAL=$SENSOR1_CUR_MAX_VAL
        echo $SENSOR1_NEW_MAX > $SENSOR1_MAX
    fi
    if [ "$SENSOR2_CUR_MAX_VAL" -ne "$SENSOR2_NEW_MAX" ]
    then
        SENSOR2_NEW_MAX=$((SENSOR2_CUR_MAX_VAL + 5000 - 63))
        SENSOR2_MAX_VAL=$SENSOR2_CUR_MAX_VAL
        echo $SENSOR2_NEW_MAX > $SENSOR2_MAX
    fi
    if [ "$SENSOR3_CUR_MAX_VAL" -ne "$SENSOR3_NEW_MAX" ]
    then
        SENSOR3_NEW_MAX=$((SENSOR3_CUR_MAX_VAL + 5000 - 63))
        SENSOR3_MAX_VAL=$SENSOR3_CUR_MAX_VAL
        echo $SENSOR3_NEW_MAX > $SENSOR3_MAX
    fi

    # Read every sensor exactly once so that the value compared against the
    # limit and the value reported in the log come from the same sample.
    # All sensors output in 1000's.
    s1=$(cat $SENSOR1)
    s2=$(cat $SENSOR2)
    s3=$(cat $SENSOR3)
    sensor1=$((s1 / 1000))
    sensor2=$((s2 / 1000))
    sensor3=$((s3 / 1000))

    if [ "$s1" -ge "$SENSOR1_MAX_VAL" ] || [ "$s2" -ge "$SENSOR2_MAX_VAL" ] || [ "$s3" -ge "$SENSOR3_MAX_VAL" ]
    then
        # Thermal trip is about to happen
        echo "Thermal Overload $sensor1 $sensor2 $sensor3" > "$LRR_FILE"
        logger "Shutting down due to over temperature ($sensor1 degree, $sensor2 degree, $sensor3 degree)"
        sync
        sleep 1 # Give time to send logger message to server
        # Assigning the original max values back in sensors
        echo $SENSOR1_MAX_VAL > $SENSOR1_MAX
        echo $SENSOR2_MAX_VAL > $SENSOR2_MAX
        echo $SENSOR3_MAX_VAL > $SENSOR3_MAX

        # Per the original author's note, execution is not expected to get
        # past this point; the explicit power-off below is the fallback in
        # case the hardware fails to shut down.
        /sbin/shutdown -P now
    fi

    sum=$(($sensor1 + $sensor2 + $sensor3))
    sensor_temp=$(($sum/3))

    # Each branch fires only when the level actually changes, and logs the
    # RPM value that was really passed to update_fan_speed.
    if [ "$sensor_temp" -le "25" ] && [ "$LEVEL" -ne "0" ]
    then
        # Set Fan Speed to 7000 RPM
        LEVEL=0
        update_fan_speed $IDLE
        logger "Adjusted FAN Speed to $IDLE RPM against $sensor_temp Temperature"

    elif [ "$sensor_temp" -ge "26" ] && [ "$sensor_temp" -le "44" ] && [ "$LEVEL" -ne "1" ]
    then
        # Set Fan Speed to 10000 RPM
        LEVEL=1
        update_fan_speed $LEVEL1
        logger "Adjusted FAN Speed to $LEVEL1 RPM against $sensor_temp Temperature"

    elif [ "$sensor_temp" -ge "45" ] && [ "$sensor_temp" -le "59" ] && [ "$LEVEL" -ne "2" ]
    then
        # Set Fan Speed to 13000 RPM
        LEVEL=2
        update_fan_speed $LEVEL2
        logger "Adjusted FAN Speed to $LEVEL2 RPM against $sensor_temp Temperature"

    elif [ "$sensor_temp" -ge "60" ] && [ "$sensor_temp" -le "79" ] && [ "$LEVEL" -ne "3" ]
    then
        # Set Fan Speed to 16000 RPM
        LEVEL=3
        update_fan_speed $LEVEL3
        logger "Adjusted FAN Speed to $LEVEL3 RPM against $sensor_temp Temperature"

    elif [ "$sensor_temp" -ge "80" ] && [ "$LEVEL" -ne "4" ]
    then
        # Set Fan Speed to 19000 RPM
        LEVEL=4
        update_fan_speed $LEVEL4
        logger "Adjusted FAN Speed to $LEVEL4 RPM against $sensor_temp Temperature"
    fi

    # Check for faulty fan
    check_faulty_fan
}
177241

178242
# Check drivers for sysfs attributes

platform/broadcom/sonic-platform-modules-dell/s6000/sonic_platform/chassis.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import os
1212
import time
1313
import datetime
14+
import struct
1415
import subprocess
1516
from sonic_platform_base.chassis_base import ChassisBase
1617
from sonic_platform.sfp import Sfp
@@ -41,6 +42,7 @@ class Chassis(ChassisBase):
4142
reset_reason_dict = {}
4243
reset_reason_dict[0xe] = ChassisBase.REBOOT_CAUSE_NON_HARDWARE
4344
reset_reason_dict[0x6] = ChassisBase.REBOOT_CAUSE_NON_HARDWARE
45+
reset_reason_dict[0x7] = ChassisBase.REBOOT_CAUSE_THERMAL_OVERLOAD_OTHER
4446

4547
def __init__(self):
4648
ChassisBase.__init__(self)
@@ -100,6 +102,36 @@ def _get_cpld_register(self, reg_name):
100102
rv = rv.lstrip(" ")
101103
return rv
102104

105+
def _nvram_write(self, offset, val):
106+
resource = "/dev/nvram"
107+
fd = os.open(resource, os.O_RDWR)
108+
if (fd < 0):
109+
print('File open failed ',resource)
110+
return
111+
if (os.lseek(fd, offset, os.SEEK_SET) != offset):
112+
print('lseek failed on ',resource)
113+
return
114+
ret = os.write(fd, struct.pack('B', val))
115+
if ret != 1:
116+
print('Write failed ',str(ret))
117+
return
118+
os.close(fd)
119+
120+
def _get_thermal_reset(self):
121+
reset_file = "/host/reboot-cause/reboot-cause.txt"
122+
if (not os.path.isfile(reset_file)):
123+
return False
124+
try:
125+
with open(reset_file, 'r') as fd:
126+
rv = fd.read()
127+
except Exception as error:
128+
return False
129+
130+
if "Thermal Overload" in rv:
131+
return True
132+
133+
return False
134+
103135
def get_name(self):
104136
"""
105137
Retrieves the name of the chassis
@@ -181,6 +213,8 @@ def get_reboot_cause(self):
181213
# NVRAM. Only Warmboot and Coldboot reason are supported here.
182214
# Since it does not support any hardware reason, we return
183215
# non_hardware as default
216+
if self._get_thermal_reset() == True:
217+
self._nvram_write(0x49, 0x7)
184218

185219
lrr = self._get_cpld_register('last_reboot_reason')
186220
if (lrr != 'ERR'):

0 commit comments

Comments
 (0)