Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
From e588b87a2e80de284ea6e63563e27684acb2f56e Mon Sep 17 00:00:00 2001
From 4f6c7ddb8259791a87e67e97d8eb7e3223c53435 Mon Sep 17 00:00:00 2001
From: davidza <davidza@nvidia.com>
Date: Tue, 15 Oct 2024 09:44:05 +0300
Subject: [PATCH 1/3] Make SONiC determine-reboot-cause service start after
Subject: [PATCH 1/2] Make SONiC determine-reboot-cause service start after
hw-mgmt service

Signed-off-by: Kebo Liu <kebol@nvidia.com>
Expand All @@ -10,17 +10,17 @@ Signed-off-by: Kebo Liu <kebol@nvidia.com>
1 file changed, 1 insertion(+)

diff --git a/debian/hw-management.hw-management.service b/debian/hw-management.hw-management.service
index 4bc1780e..1a50dc3c 100755
index 8bdcaef5..1c25ffb2 100755
--- a/debian/hw-management.hw-management.service
+++ b/debian/hw-management.hw-management.service
@@ -2,6 +2,7 @@
@@ -1,6 +1,7 @@
[Unit]
Description=Chassis HW management service of Mellanox systems
Documentation=man:hw-management.service(8)
Wants=hw-management-sync.service
+Before=determine-reboot-cause.service

[Service]
Type=oneshot
--
2.34.1
2.43.0

Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
From 69d083e05e39dc82567a338a59a2cefdcd022034 Mon Sep 17 00:00:00 2001
From f738741680505e42217415a6b5dfc70eae347398 Mon Sep 17 00:00:00 2001
From: davidza <davidza@nvidia.com>
Date: Tue, 15 Oct 2024 09:51:11 +0300
Subject: [PATCH 3/3] Make system-health service starts after hw-management to
Subject: [PATCH 2/2] Make system-health service starts after hw-management to
avoid failures

On SN2410, it can fail to read the file led_status_capability if it starts from ONIE
Expand All @@ -12,18 +12,18 @@ Signed-off-by: Stephen Sun <stephens@nvidia.com>
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/debian/hw-management.hw-management.service b/debian/hw-management.hw-management.service
index 1a50dc3c..8a5c0423 100755
index 1c25ffb2..639bd3cd 100755
--- a/debian/hw-management.hw-management.service
+++ b/debian/hw-management.hw-management.service
@@ -2,7 +2,7 @@
@@ -1,7 +1,7 @@
[Unit]
Description=Chassis HW management service of Mellanox systems
Documentation=man:hw-management.service(8)
Wants=hw-management-sync.service
-Before=determine-reboot-cause.service
+Before=determine-reboot-cause.service system-health.service watchdog-control.service

[Service]
Type=oneshot
--
2.34.1
2.43.0

79 changes: 77 additions & 2 deletions platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,18 @@
except ImportError as e:
raise ImportError (str(e) + "- required module not found")

try:
import hw_management_independent_mode_update
except ImportError:
# Only mock if running under pytest (check if pytest is imported)
import sys
if 'pytest' in sys.modules:
from unittest import mock
hw_management_independent_mode_update = mock.MagicMock()
hw_management_independent_mode_update.vendor_data_set_module = mock.MagicMock()
else:
raise

# Define the sdk constants
SX_PORT_MODULE_STATUS_INITIALIZING = 0
SX_PORT_MODULE_STATUS_PLUGGED = 1
Expand Down Expand Up @@ -426,6 +438,9 @@ def __init__(self, sfp_index, sfp_type=None, slot_id=0, linecard_port_count=0, l
self.sn = None
self.temp_high_threshold = None
self.temp_critical_threshold = None
self.manufacturer = None
self.part_number = None
self.retry_read_vendor = 5

def __str__(self):
return f'SFP {self.sdk_index}'
Expand Down Expand Up @@ -858,19 +873,79 @@ def reinit_if_sn_changed(self):
self.temp_critical_threshold = None
return True
return False


def get_vendor_info(self):
"""Get SFP vendor info (manufacturer and part number).
Reads fields via xcvr_eeprom to avoid manual offset logic.
Uses cache to avoid redundant reads. Retries are aligned with threshold retry style
using a stateful counter across invocations.
Returns:
tuple: (manufacturer, part_number) or (None, None) if read fails
"""
try:
display_idx = self.sdk_index + 1
if self.manufacturer is not None and self.part_number is not None:
return self.manufacturer, self.part_number

api = self.get_xcvr_api()
if not api or api.xcvr_eeprom is None:
return None, None

# Single-attempt per call; subsequent calls will retry while counter > 0
try:
manufacturer = api.xcvr_eeprom.read(consts.VENDOR_NAME_FIELD)
part_number = api.xcvr_eeprom.read(consts.VENDOR_PART_NO_FIELD)
logger.log_debug(f"SFP {display_idx} vendor info read: manufacturer='{manufacturer}', part_number='{part_number}'")
except Exception as e:
logger.log_debug(f"SFP {display_idx} vendor info read failed: {e}")
manufacturer = None
part_number = None

if manufacturer and part_number:
self.manufacturer = manufacturer
self.part_number = part_number
# Stop retrying after success
self.retry_read_vendor = 0
return manufacturer, part_number

# Defer retry to subsequent calls, aligned with threshold retry logic
if self.retry_read_vendor > 0:
self.retry_read_vendor -= 1
if self.retry_read_vendor == 0:
logger.log_notice(f"SFP {display_idx}: vendor info unavailable after retries")

return None, None
except Exception:
return None, None

def get_temperature_info(self):
"""Get SFP temperature info in a fast way. This function is faster than calling following functions one by one: get_temperature, get_temperature_warning_threshold, get_temperature_critical_threshold.

Returns:
tuple: (temperature, warning_threshold, critical_threshold)
"""
try:
sn_changed = self.reinit_if_sn_changed()
if sn_changed:
# On module change, publish vendor info to hw-management
try:
manufacturer, part_number = self.get_vendor_info()
if manufacturer and part_number:
vendor_info = {'manufacturer': manufacturer, 'part_number': part_number}
hw_management_independent_mode_update.vendor_data_set_module(
0, # ASIC index always 0 for now
self.sdk_index + 1,
vendor_info
)
logger.log_notice(f'Module {self.sdk_index + 1} vendor info updated - '
f'manufacturer: {manufacturer} part_number: {part_number}')
except Exception as e:
logger.log_warning(f'Failed to publish vendor info for SFP {self.sdk_index + 1} - {e}')

sw_control = self.is_sw_control()
if not sw_control:
return sw_control, None, None, None

sn_changed = self.reinit_if_sn_changed()
# software control, read from EEPROM
temperature = super().get_temperature()
if temperature is None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,17 @@
try:
import hw_management_independent_mode_update
except ImportError:
# For unit test only
from unittest import mock
hw_management_independent_mode_update = mock.MagicMock()
hw_management_independent_mode_update.module_data_set_module_counter = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_set_asic = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_set_module = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_clean_asic = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_clean_module = mock.MagicMock()
# Only mock if running under pytest (check if pytest is imported)
if 'pytest' in sys.modules:
from unittest import mock
hw_management_independent_mode_update = mock.MagicMock()
hw_management_independent_mode_update.module_data_set_module_counter = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_set_asic = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_set_module = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_clean_asic = mock.MagicMock()
hw_management_independent_mode_update.thermal_data_clean_module = mock.MagicMock()
else:
raise


SFP_TEMPERATURE_SCALE = 1000
Expand Down
94 changes: 92 additions & 2 deletions platform/mellanox/mlnx-platform-api/tests/test_sfp.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
modules_path = os.path.dirname(test_path)
sys.path.insert(0, modules_path)

from sonic_platform.sfp import SFP, RJ45Port, CpoPort, CPO_TYPE, cmis_api, SX_PORT_MODULE_STATUS_INITIALIZING, SX_PORT_MODULE_STATUS_PLUGGED, SX_PORT_MODULE_STATUS_UNPLUGGED, SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR, SX_PORT_MODULE_STATUS_PLUGGED_DISABLED
from sonic_platform.sfp import SFP, NvidiaSFPCommon, RJ45Port, CpoPort, CPO_TYPE, cmis_api, SX_PORT_MODULE_STATUS_INITIALIZING, SX_PORT_MODULE_STATUS_PLUGGED, SX_PORT_MODULE_STATUS_UNPLUGGED, SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR, SX_PORT_MODULE_STATUS_PLUGGED_DISABLED
from sonic_platform.chassis import Chassis


Expand Down Expand Up @@ -246,8 +246,10 @@ def test_get_page_and_page_offset(self, mock_get_type_str, mock_eeprom_path, moc

@mock.patch('sonic_platform.utils.read_int_from_file')
@mock.patch('sonic_platform.sfp.SFP._read_eeprom')
def test_sfp_get_presence(self, mock_read, mock_read_int):
@mock.patch('sonic_platform.sfp.SFP.is_sw_control')
def test_sfp_get_presence(self, mock_is_sw_control, mock_read, mock_read_int):
sfp = SFP(0)
mock_is_sw_control.return_value = False

mock_read_int.return_value = 1
mock_read.return_value = None
Expand Down Expand Up @@ -607,3 +609,91 @@ def mock_read(field):
assert sfp.get_temperature_info() == (True, 58.0, 75.0, 85.0)
sfp.is_sw_control.side_effect = Exception('')
assert sfp.get_temperature_info() == (False, None, None, None)

@mock.patch('time.sleep', mock.MagicMock())
def test_get_vendor_info_success_and_cache(self):
sfp = SFP(0)
mock_api = mock.MagicMock()
mock_eeprom = mock.MagicMock()
mock_api.xcvr_eeprom = mock_eeprom
sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api)

from sonic_platform_base.sonic_xcvr.fields import consts
def mock_read(field):
if field == consts.VENDOR_NAME_FIELD:
return 'Mellanox'
if field == consts.VENDOR_PART_NO_FIELD:
return 'PN-1234'
return None
mock_eeprom.read.side_effect = mock_read

# First call reads from eeprom
manufacturer, part_number = sfp.get_vendor_info()
assert manufacturer == 'Mellanox'
assert part_number == 'PN-1234'
assert mock_eeprom.read.call_count == 2

# Second call should return cached values without additional reads
manufacturer, part_number = sfp.get_vendor_info()
assert manufacturer == 'Mellanox'
assert part_number == 'PN-1234'
assert mock_eeprom.read.call_count == 2

@mock.patch('time.sleep', mock.MagicMock())
def test_get_vendor_info_retry_then_success(self):
sfp = SFP(0)
mock_api = mock.MagicMock()
mock_eeprom = mock.MagicMock()
mock_api.xcvr_eeprom = mock_eeprom
sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api)

from sonic_platform_base.sonic_xcvr.fields import consts
state = {'fail_reads': 2}
def flaky_read(field):
if state['fail_reads'] > 0:
state['fail_reads'] -= 1
raise Exception('EEPROM not ready')
if field == consts.VENDOR_NAME_FIELD:
return 'Mellanox'
if field == consts.VENDOR_PART_NO_FIELD:
return 'PN-5678'
return None
mock_eeprom.read.side_effect = flaky_read

# First call fails (counter decremented)
manufacturer, part_number = sfp.get_vendor_info()
assert (manufacturer, part_number) == (None, None)
# Second call fails (counter decremented)
manufacturer, part_number = sfp.get_vendor_info()
assert (manufacturer, part_number) == (None, None)
# Third call succeeds (reads both fields)
manufacturer, part_number = sfp.get_vendor_info()
assert (manufacturer, part_number) == ('Mellanox', 'PN-5678')
# Total read invocations: first two calls each raise on first field (2),
# third call reads both fields (2) → 4 total
assert mock_eeprom.read.call_count == 4

@mock.patch('time.sleep', mock.MagicMock())
def test_get_vendor_info_all_fail(self):
sfp = SFP(0)
mock_api = mock.MagicMock()
mock_eeprom = mock.MagicMock()
mock_api.xcvr_eeprom = mock_eeprom
sfp.get_xcvr_api = mock.MagicMock(return_value=mock_api)
mock_eeprom.read.side_effect = Exception('EEPROM error')

manufacturer, part_number = sfp.get_vendor_info()
assert manufacturer is None
assert part_number is None

def test_get_vendor_info_no_api_or_missing_attr(self):
sfp = SFP(0)
# No API
sfp.get_xcvr_api = mock.MagicMock(return_value=None)
assert sfp.get_vendor_info() == (None, None)

# API without xcvr_eeprom attribute
class DummyApi(object):
pass
sfp.get_xcvr_api.return_value = DummyApi()
assert sfp.get_vendor_info() == (None, None)
43 changes: 43 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/test_thermal_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,46 @@ def test_update_module(self):
hw_management_independent_mode_update.reset_mock()
updater.update_module()
hw_management_independent_mode_update.thermal_data_set_module.assert_called_once_with(0, 11, 0, 0, 0, 0)

# ---- SFP.get_temperature_info publishes vendor info on module change ----
def _make_sfp_for_publish(self, sn_changed=True, vendor=('Innolight', 'TR-iQ13L-NVS')):
# Import locally to avoid any potential name resolution issues in test scope
from sonic_platform.sfp import SFP as _SFP
sfp = object.__new__(_SFP)
sfp.sdk_index = 10
sfp.is_sw_control = mock.MagicMock(return_value=True)
sfp.reinit_if_sn_changed = mock.MagicMock(return_value=sn_changed)
if vendor is None:
sfp.get_vendor_info = mock.MagicMock(return_value=(None, None))
else:
sfp.get_vendor_info = mock.MagicMock(return_value=vendor)
api = mock.MagicMock()
api.get_transceiver_thresholds_support = mock.MagicMock(return_value=False)
sfp.get_xcvr_api = mock.MagicMock(return_value=api)
return sfp

def test_sfp_get_temperature_info_publishes_vendor_on_sn_change(self):
from sonic_platform.sfp import hw_management_independent_mode_update as sfp_hw_management_independent_mode_update
sfp = self._make_sfp_for_publish(sn_changed=True, vendor=('Innolight', 'TR-iQ13L-NVS'))
with mock.patch('sonic_platform.sfp.SfpOptoeBase.get_temperature', return_value=55.0):
sfp_hw_management_independent_mode_update.reset_mock()
sfp.get_temperature_info()
sfp_hw_management_independent_mode_update.vendor_data_set_module.assert_called_once_with(
0, 11, {'manufacturer': 'Innolight', 'part_number': 'TR-iQ13L-NVS'}
)

def test_sfp_get_temperature_info_no_publish_when_no_change(self):
from sonic_platform.sfp import hw_management_independent_mode_update as sfp_hw_management_independent_mode_update
sfp = self._make_sfp_for_publish(sn_changed=False, vendor=('Innolight', 'TR-iQ13L-NVS'))
with mock.patch('sonic_platform.sfp.SfpOptoeBase.get_temperature', return_value=55.0):
sfp_hw_management_independent_mode_update.reset_mock()
sfp.get_temperature_info()
sfp_hw_management_independent_mode_update.vendor_data_set_module.assert_not_called()

def test_sfp_get_temperature_info_no_publish_when_vendor_missing(self):
from sonic_platform.sfp import hw_management_independent_mode_update as sfp_hw_management_independent_mode_update
sfp = self._make_sfp_for_publish(sn_changed=True, vendor=None)
with mock.patch('sonic_platform.sfp.SfpOptoeBase.get_temperature', return_value=55.0):
sfp_hw_management_independent_mode_update.reset_mock()
sfp.get_temperature_info()
sfp_hw_management_independent_mode_update.vendor_data_set_module.assert_not_called()