Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class Chassis(ChassisBase):
# System UID LED
_led_uid = None

chassis_instance = None

def __init__(self):
super(Chassis, self).__init__()

Expand Down Expand Up @@ -127,6 +129,8 @@ def __init__(self):
self._RJ45_port_inited = False
self._RJ45_port_list = None

Chassis.chassis_instance = self

self.modules_mgmt_thread = threading.Thread()
self.modules_changes_queue = queue.Queue()
self.modules_mgmt_task_stopping_event = threading.Event()
Expand Down
79 changes: 78 additions & 1 deletion platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
from . import utils
from .device_data import DeviceDataManager
from sonic_platform_base.sonic_xcvr.sfp_optoe_base import SfpOptoeBase
from sonic_platform_base.sonic_xcvr.fields import consts
from sonic_platform_base.sonic_xcvr.api.public import sff8636, sff8436

except ImportError as e:
raise ImportError (str(e) + "- required module not found")
Expand Down Expand Up @@ -155,6 +157,10 @@
# SFP stderr
SFP_EEPROM_NOT_AVAILABLE = 'Input/output error'

SFP_DEFAULT_TEMP_WARNNING_THRESHOLD = 70.0
SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD = 80.0
SFP_TEMPERATURE_SCALE = 8.0

# SFP EEPROM limited bytes
limited_eeprom = {
SFP_TYPE_CMIS: {
Expand Down Expand Up @@ -264,7 +270,7 @@ def __init__(self, sfp_index, sfp_type=None, slot_id=0, linecard_port_count=0, l

if slot_id == 0: # For non-modular chassis
from .thermal import initialize_sfp_thermal
self._thermal_list = initialize_sfp_thermal(sfp_index)
self._thermal_list = initialize_sfp_thermal(self)
else: # For modular chassis
# (slot_id % MAX_LC_CONUNT - 1) * MAX_PORT_COUNT + (sfp_index + 1) * (MAX_PORT_COUNT / LC_PORT_COUNT)
max_linecard_count = DeviceDataManager.get_linecard_count()
Expand Down Expand Up @@ -822,6 +828,77 @@ def get_tx_fault(self):
api = self.get_xcvr_api()
return [False] * api.NUM_CHANNELS if api else None

def get_temperature(self):
    """Get the module temperature in degrees Celsius.

    When the module is not under software control, the raw value is read
    from the sx_core sysfs node of this module and divided by
    SFP_TEMPERATURE_SCALE. Otherwise the SFP object is re-initialized and
    the reading is delegated to the parent class.

    Returns:
        float: the temperature, when a reading is available.
        None: when the sysfs node does not exist, the read helper returns
            None, or the parent class returns None.
        0.0: when checking the control mode or reading sysfs raises an
            exception.
    """
    try:
        if not self.is_sw_control():
            temp_file = f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/input'
            if not os.path.exists(temp_file):
                logger.log_error(f'Failed to read from file {temp_file} - not exists')
                return None
            temperature = utils.read_int_from_file(temp_file, log_func=None)
            return temperature / SFP_TEMPERATURE_SCALE if temperature is not None else None
    except Exception:
        # Deliberate best-effort fallback. Only ordinary errors are absorbed;
        # KeyboardInterrupt / SystemExit must still propagate, which a bare
        # "except:" would prevent.
        # NOTE(review): presumably is_sw_control() raises while the module is
        # still being initialized - confirm against its implementation.
        return 0.0

    # Software-controlled module: refresh the SFP state, then let the parent
    # class produce the reading (it may return None).
    self.reinit()
    return super().get_temperature()

def get_temperature_warning_threashold(self):
    """Get temperature warning threshold

    When the module is not under software control, the value is read from
    the module's sx_core sysfs node and divided by SFP_TEMPERATURE_SCALE.
    Otherwise it is looked up in the thresholds returned by
    _get_temperature_threshold().

    Returns:
        float: temperature warning threshold, or
            SFP_DEFAULT_TEMP_WARNNING_THRESHOLD when the value cannot be
            obtained (read helper returned None, an exception was raised,
            or the threshold field is missing).
    """
    try:
        if not self.is_sw_control():
            # NOTE(review): the warning threshold is taken from the
            # "emergency" sysfs node - confirm this mapping is intended.
            emergency = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/emergency',
                log_func=None,
                default=None)
            return emergency / SFP_TEMPERATURE_SCALE if emergency is not None else SFP_DEFAULT_TEMP_WARNNING_THRESHOLD
    except Exception:
        # Best-effort fallback for ordinary errors only; a bare "except:"
        # would also swallow KeyboardInterrupt / SystemExit.
        return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_WARNING_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_WARNING_FIELD]
    return SFP_DEFAULT_TEMP_WARNNING_THRESHOLD

def get_temperature_critical_threashold(self):
    """Get temperature critical threshold

    When the module is not under software control, the value is read from
    the module's sx_core "critical" sysfs node and divided by
    SFP_TEMPERATURE_SCALE. Otherwise it is looked up in the thresholds
    returned by _get_temperature_threshold().

    Returns:
        float: temperature critical threshold, or
            SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD when the value cannot be
            obtained (read helper returned None, an exception was raised,
            or the threshold field is missing).
    """
    try:
        if not self.is_sw_control():
            critical = utils.read_int_from_file(
                f'/sys/module/sx_core/asic0/module{self.sdk_index}/temperature/critical',
                log_func=None,
                default=None)
            return critical / SFP_TEMPERATURE_SCALE if critical is not None else SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD
    except Exception:
        # Best-effort fallback for ordinary errors only; a bare "except:"
        # would also swallow KeyboardInterrupt / SystemExit.
        return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD

    thresh = self._get_temperature_threshold()
    if thresh and consts.TEMP_HIGH_ALARM_FIELD in thresh:
        return thresh[consts.TEMP_HIGH_ALARM_FIELD]
    return SFP_DEFAULT_TEMP_CRITICAL_THRESHOLD

def _get_temperature_threshold(self):
    """Read the threshold data of this module from its EEPROM.

    The SFP object is re-initialized first. For SFF-8636 / SFF-8436 APIs the
    TEMP_THRESHOLDS_FIELD is read; for every other API the THRESHOLDS_FIELD
    is read.

    Returns:
        The value returned by the EEPROM read, or None when no xcvr API is
        available or the API reports that thresholds are not supported.
    """
    self.reinit()
    xcvr_api = self.get_xcvr_api()
    if not xcvr_api:
        return None

    if not xcvr_api.get_transceiver_thresholds_support():
        return None

    # Pick the EEPROM field according to the API type, then read once.
    if isinstance(xcvr_api, sff8636.Sff8636Api) or isinstance(xcvr_api, sff8436.Sff8436Api):
        threshold_field = consts.TEMP_THRESHOLDS_FIELD
    else:
        threshold_field = consts.THRESHOLDS_FIELD
    return xcvr_api.xcvr_eeprom.read(threshold_field)

def get_xcvr_api(self):
"""
Retrieves the XcvrApi associated with this SFP
Expand Down
124 changes: 97 additions & 27 deletions platform/mellanox/mlnx-platform-api/sonic_platform/thermal.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
# Global logger class instance
logger = Logger()

DEFAULT_TEMP_SCALE = 1000

"""
The most important information for creating a Thermal object is 3 sysfs files: temperature file, high threshold file and
high critical threshold file. There is no common naming rule for thermal objects on Nvidia platform. There are two types
Expand Down Expand Up @@ -72,9 +74,11 @@
"chassis thermals": [
{
"name": "ASIC",
"temperature": "asic",
"high_threshold": "mlxsw/temp_trip_hot",
"high_critical_threshold": "mlxsw/temp_trip_crit"
"temperature": "input",
"high_threshold_default": 105,
"high_critical_threshold_default": 120,
"sysfs_folder": "/sys/module/sx_core/asic0/temperature",
"scale": 8
},
{
"name": "Ambient Port Side Temp",
Expand Down Expand Up @@ -187,8 +191,8 @@ def initialize_psu_thermal(psu_index, presence_cb):
return [create_indexable_thermal(THERMAL_NAMING_RULE['psu thermals'], psu_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1, presence_cb)]


def initialize_sfp_thermal(sfp_index):
return [create_indexable_thermal(THERMAL_NAMING_RULE['sfp thermals'], sfp_index, CHASSIS_THERMAL_SYSFS_FOLDER, 1)]
def initialize_sfp_thermal(sfp):
return [ModuleThermal(sfp)]


def initialize_linecard_thermals(lc_name, lc_index):
Expand All @@ -214,6 +218,7 @@ def initialize_linecard_sfp_thermal(lc_name, lc_index, sfp_index):
def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=None):
index += rule.get('start_index', 1)
name = rule['name'].format(index)
sysfs_folder = rule.get('sysfs_folder', sysfs_folder)
temp_file = os.path.join(sysfs_folder, rule['temperature'].format(index))
_check_thermal_sysfs_existence(temp_file)
if 'high_threshold' in rule:
Expand All @@ -226,10 +231,13 @@ def create_indexable_thermal(rule, index, sysfs_folder, position, presence_cb=No
_check_thermal_sysfs_existence(high_crit_th_file)
else:
high_crit_th_file = None
high_th_default = rule.get('high_threshold_default')
high_crit_th_default = rule.get('high_critical_threshold_default')
scale = rule.get('scale', DEFAULT_TEMP_SCALE)
if not presence_cb:
return Thermal(name, temp_file, high_th_file, high_crit_th_file, position)
return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
else:
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb)
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb)


def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
Expand All @@ -243,6 +251,7 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
elif not default_present:
return None

sysfs_folder = rule.get('sysfs_folder', sysfs_folder)
temp_file = os.path.join(sysfs_folder, temp_file)
_check_thermal_sysfs_existence(temp_file)
if 'high_threshold' in rule:
Expand All @@ -255,11 +264,14 @@ def create_single_thermal(rule, sysfs_folder, position, presence_cb=None):
_check_thermal_sysfs_existence(high_crit_th_file)
else:
high_crit_th_file = None
high_th_default = rule.get('high_threshold_default')
high_crit_th_default = rule.get('high_critical_threshold_default')
scale = rule.get('scale', DEFAULT_TEMP_SCALE)
name = rule['name']
if not presence_cb:
return Thermal(name, temp_file, high_th_file, high_crit_th_file, position)
return Thermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
else:
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, position, presence_cb)
return RemovableThermal(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb)


def _check_thermal_sysfs_existence(file_path):
Expand All @@ -268,17 +280,7 @@ def _check_thermal_sysfs_existence(file_path):


class Thermal(ThermalBase):
thermal_algorithm_status = False
# Expect cooling level, used for caching the cooling level value before commiting to hardware
expect_cooling_level = None
# Expect cooling state
expect_cooling_state = None
# Last committed cooling level
last_set_cooling_level = None
last_set_cooling_state = None
last_set_psu_cooling_level = None

def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position):
"""
index should be a string for category ambient and int for other categories
"""
Expand All @@ -288,6 +290,9 @@ def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position):
self.temperature = temp_file
self.high_threshold = high_th_file
self.high_critical_threshold = high_crit_th_file
self.high_th_default = high_th_default
self.high_crit_th_default = high_crit_th_default
self.scale = scale

def get_name(self):
"""
Expand All @@ -307,7 +312,7 @@ def get_temperature(self):
of one degree Celsius, e.g. 30.125
"""
value = utils.read_float_from_file(self.temperature, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None
return value / self.scale if (value is not None and value != 0) else None

def get_high_threshold(self):
"""
Expand All @@ -318,9 +323,9 @@ def get_high_threshold(self):
up to nearest thousandth of one degree Celsius, e.g. 30.125
"""
if self.high_threshold is None:
return None
return self.high_th_default
value = utils.read_float_from_file(self.high_threshold, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None
return value / self.scale if (value is not None and value != 0) else self.high_th_default

def get_high_critical_threshold(self):
"""
Expand All @@ -331,9 +336,9 @@ def get_high_critical_threshold(self):
up to nearest thousandth of one degree Celsius, e.g. 30.125
"""
if self.high_critical_threshold is None:
return None
return self.high_crit_th_default
value = utils.read_float_from_file(self.high_critical_threshold, None, log_func=logger.log_info)
return value / 1000.0 if (value is not None and value != 0) else None
return value / self.scale if (value is not None and value != 0) else self.high_crit_th_default

def get_position_in_parent(self):
"""
Expand All @@ -353,8 +358,8 @@ def is_replaceable(self):


class RemovableThermal(Thermal):
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, position, presence_cb):
super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, position)
def __init__(self, name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position, presence_cb):
super(RemovableThermal, self).__init__(name, temp_file, high_th_file, high_crit_th_file, high_th_default, high_crit_th_default, scale, position)
self.presence_cb = presence_cb

def get_temperature(self):
Expand Down Expand Up @@ -398,3 +403,68 @@ def get_high_critical_threshold(self):
logger.log_debug("get_high_critical_threshold for {} failed due to {}".format(self.name, hint))
return None
return super(RemovableThermal, self).get_high_critical_threshold()


class ModuleThermal(ThermalBase):
    """Thermal object that delegates every reading to an SFP object."""

    def __init__(self, sfp):
        """
        Build the thermal for one module.

        Args:
            sfp: object providing sdk_index, get_temperature(),
                get_temperature_warning_threashold() and
                get_temperature_critical_threashold()
        """
        super().__init__()
        self.sfp = sfp
        # The displayed module number is sdk_index shifted by one.
        self.name = 'xSFP module {} Temp'.format(sfp.sdk_index + 1)

    def get_name(self):
        """
        Retrieves the name of the device

        Returns:
            string: The name built in the constructor
        """
        return self.name

    def get_temperature(self):
        """
        Retrieves current temperature reading from thermal

        Returns:
            Whatever the underlying SFP object reports from get_temperature()
        """
        sfp = self.sfp
        return sfp.get_temperature()

    def get_high_threshold(self):
        """
        Retrieves the high threshold temperature of thermal

        Returns:
            The temperature warning threshold reported by the underlying SFP object
        """
        sfp = self.sfp
        return sfp.get_temperature_warning_threashold()

    def get_high_critical_threshold(self):
        """
        Retrieves the high critical threshold temperature of thermal

        Returns:
            The temperature critical threshold reported by the underlying SFP object
        """
        sfp = self.sfp
        return sfp.get_temperature_critical_threashold()

    def get_position_in_parent(self):
        """
        Retrieves 1-based relative physical position in parent device

        Returns:
            integer: always 1
        """
        return 1

    def is_replaceable(self):
        """
        Indicate whether this device is replaceable.

        Returns:
            bool: always False
        """
        return False
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,36 @@
# limitations under the License.
#
from sonic_platform_base.sonic_thermal_control.thermal_manager_base import ThermalManagerBase
from . import thermal_updater
from .device_data import DeviceDataManager


class ThermalManager(ThermalManagerBase):
    """Thermal manager that runs a ThermalUpdater task in independent mode."""

    # ThermalUpdater created by initialize(); stays None until then.
    thermal_updater_task = None

    @classmethod
    def run_policy(cls, chassis):
        """Intentionally a no-op: no thermal policy is run here."""
        pass

    @classmethod
    def initialize(cls):
        """
        Initialize thermal manager. When the device is in independent mode,
        create a ThermalUpdater over all SFPs of the chassis instance and
        start it; otherwise do nothing.
        :return:
        """
        if DeviceDataManager.is_independent_mode():
            # Imported locally rather than at module level.
            # NOTE(review): presumably to avoid a circular import - confirm.
            from .chassis import Chassis
            cls.thermal_updater_task = thermal_updater.ThermalUpdater(Chassis.chassis_instance.get_all_sfps())
            cls.thermal_updater_task.start()

    @classmethod
    def deinitialize(cls):
        """
        Destroy thermal manager. Stops the ThermalUpdater task if the device
        is in independent mode and a task was actually created.
        :return:
        """
        # Guard against deinitialize() being called when initialize() never
        # ran or failed before creating the task; calling stop() on None
        # would raise AttributeError.
        if DeviceDataManager.is_independent_mode() and cls.thermal_updater_task is not None:
            cls.thermal_updater_task.stop()
Loading