Skip to content

Commit cb5135f

Browse files
Merge pull request #80 from stephenxs/sfp-bit-map-enhance
Support handling the error status of SFP modules
2 parents df81073 + d4b63df commit cb5135f

5 files changed

Lines changed: 210 additions & 42 deletions

File tree

platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -571,18 +571,22 @@ def get_change_event(self, timeout=0):
571571

572572
wait_for_ever = (timeout == 0)
573573
port_dict = {}
574+
error_dict = {}
574575
if wait_for_ever:
575576
timeout = MAX_SELECT_DELAY
576577
while True:
577-
status = self.sfp_event.check_sfp_status(port_dict, timeout)
578+
status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout)
578579
if bool(port_dict):
579580
break
580581
else:
581-
status = self.sfp_event.check_sfp_status(port_dict, timeout)
582+
status = self.sfp_event.check_sfp_status(port_dict, error_dict, timeout)
582583

583584
if status:
584585
self.reinit_sfps(port_dict)
585-
return True, {'sfp':port_dict}
586+
result_dict = {'sfp':port_dict}
587+
if error_dict:
588+
result_dict['sfp_error'] = error_dict
589+
return True, result_dict
586590
else:
587591
return True, {'sfp':{}}
588592

platform/mellanox/mlnx-platform-api/sonic_platform/sfp.py

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
try:
1212
import subprocess
13+
import os
1314
from sonic_platform_base.sfp_base import SfpBase
1415
from sonic_platform_base.sonic_eeprom import eeprom_dts
1516
from sonic_platform_base.sonic_sfp.sff8472 import sff8472InterfaceId
@@ -35,6 +36,18 @@
3536
except ImportError as e:
3637
pass
3738

39+
# The SDK constants cannot be imported under unit test, so define them
# here when the PLATFORM_API_UNIT_TESTING environment variable is "1".
# A missing variable is treated the same as any value other than "1".
if os.environ.get("PLATFORM_API_UNIT_TESTING") == "1":
    SX_PORT_MODULE_STATUS_INITIALIZING = 0
    SX_PORT_MODULE_STATUS_PLUGGED = 1
    SX_PORT_MODULE_STATUS_UNPLUGGED = 2
    SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR = 3
    SX_PORT_MODULE_STATUS_PLUGGED_DISABLED = 4
50+
3851
# definitions of the offset and width for values in XCVR info eeprom
3952
XCVR_INTFACE_BULK_OFFSET = 0
4053
XCVR_INTFACE_BULK_WIDTH_QSFP = 20
@@ -330,6 +343,18 @@ def __exit__(self, exc_type, exc_val, exc_tb):
330343
class SFP(SfpBase):
331344
"""Platform-specific SFP class"""
332345

346+
SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE = 'Long range for non-Mellanox cable or module'
347+
SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST = 'Enforce part number list'
348+
SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED = 'PMD type not enabled'
349+
SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED = 'PCIE system power slot exceeded'
350+
SFP_MLNX_ERROR_DESCRIPTION_RESERVED = 'Reserved'
351+
352+
SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE = 0x00010000
353+
SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST = 0x00020000
354+
SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED = 0x00040000
355+
SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED = 0x00080000
356+
SFP_MLNX_ERROR_BIT_RESERVED = 0x80000000
357+
333358
def __init__(self, sfp_index, sfp_type, sdk_handle_getter, platform):
334359
SfpBase.__init__(self)
335360
self.index = sfp_index + 1
@@ -388,7 +413,7 @@ def get_presence(self):
388413
# Read out any bytes from any offset
389414
def _read_eeprom_specific_bytes(self, offset, num_bytes):
390415
eeprom_raw = []
391-
ethtool_cmd = "ethtool -m sfp{} hex on offset {} length {}".format(self.index, offset, num_bytes)
416+
ethtool_cmd = "ethtool -m sfp{} hex on offset {} length {} 2>/dev/null".format(self.index, offset, num_bytes)
392417
try:
393418
output = subprocess.check_output(ethtool_cmd,
394419
shell=True,
@@ -2158,3 +2183,68 @@ def is_replaceable(self):
21582183
bool: True if it is replaceable.
21592184
"""
21602185
return True
2186+
2187+
def _get_error_code(self):
2188+
"""
2189+
Get error code of the SFP module
2190+
2191+
Returns:
2192+
The error code fetch from SDK API
2193+
"""
2194+
module_id_info_list = new_sx_mgmt_module_id_info_t_arr(1)
2195+
module_info_list = new_sx_mgmt_phy_module_info_t_arr(1)
2196+
2197+
module_id_info = sx_mgmt_module_id_info_t()
2198+
module_id_info.slot_id = 0
2199+
module_id_info.module_id = self.sdk_index
2200+
sx_mgmt_module_id_info_t_arr_setitem(module_id_info_list, 0, module_id_info)
2201+
2202+
rc = sx_mgmt_phy_module_info_get(self.sdk_handle, module_id_info_list, 1, module_info_list)
2203+
assert SX_STATUS_SUCCESS == rc, "sx_mgmt_phy_module_info_get failed, error code {}".format(rc)
2204+
2205+
mod_info = sx_mgmt_phy_module_info_t_arr_getitem(module_info_list, 0)
2206+
return mod_info.module_state.oper_state, mod_info.module_state.error_type
2207+
2208+
@classmethod
def _get_error_description_dict(cls):
    """
    Build the table that maps an error code to its description

    Returns:
        A new dict keyed by integer error code whose values are the
        SFP_ERROR_DESCRIPTION_* and SFP_MLNX_ERROR_DESCRIPTION_*
        attributes of the class
    """
    code_description_pairs = (
        (0, cls.SFP_ERROR_DESCRIPTION_POWER_BUDGET_EXCEEDED),
        (1, cls.SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE),
        (2, cls.SFP_ERROR_DESCRIPTION_I2C_STUCK),
        (3, cls.SFP_ERROR_DESCRIPTION_BAD_EEPROM),
        (4, cls.SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST),
        (5, cls.SFP_ERROR_DESCRIPTION_UNSUPPORTED_CABLE),
        (6, cls.SFP_ERROR_DESCRIPTION_HIGH_TEMP),
        (7, cls.SFP_ERROR_DESCRIPTION_BAD_CABLE),
        (8, cls.SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED),
        (12, cls.SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED),
        (255, cls.SFP_MLNX_ERROR_DESCRIPTION_RESERVED),
    )
    return dict(code_description_pairs)
2222+
2223+
def get_error_description(self):
2224+
"""
2225+
Get error description
2226+
2227+
Args:
2228+
error_code: The error code returned by _get_error_code
2229+
2230+
Returns:
2231+
The error description
2232+
"""
2233+
oper_status, error_code = self._get_error_code()
2234+
if oper_status == SX_PORT_MODULE_STATUS_INITIALIZING:
2235+
error_description = self.SFP_STATUS_INITIALIZING
2236+
elif oper_status == SX_PORT_MODULE_STATUS_PLUGGED:
2237+
error_description = self.SFP_STATUS_OK
2238+
elif oper_status == SX_PORT_MODULE_STATUS_UNPLUGGED:
2239+
error_description = self.SFP_STATUS_UNPLUGGED
2240+
elif oper_status == SX_PORT_MODULE_STATUS_PLUGGED_DISABLED:
2241+
error_description = self.SFP_STATUS_DISABLED
2242+
elif oper_status == SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR:
2243+
error_description_dict = self._get_error_description_dict()
2244+
if error_code in error_description_dict:
2245+
error_description = error_description_dict[error_code]
2246+
else:
2247+
error_description = "Unknown error ({})".format(error_code)
2248+
else:
2249+
error_description = "Unknow SFP module status ({})".format(oper_status)
2250+
return error_description

platform/mellanox/mlnx-platform-api/sonic_platform/sfp_event.py

Lines changed: 50 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class MockSxFd(object):
1717
new_sx_fd_t_p = MagicMock(return_value=MockSxFd())
1818
new_sx_user_channel_t_p = MagicMock()
1919
from sonic_py_common.logger import Logger
20+
from .sfp import SFP
2021

2122
# SFP status from PMAOS register
2223
# 0x1 plug in
@@ -30,16 +31,6 @@ class MockSxFd(object):
3031
SDK_SFP_STATE_ERR = 0x3
3132
SDK_SFP_STATE_DIS = 0x4
3233

33-
# SFP status that will be handled by XCVRD
34-
STATUS_PLUGIN = '1' # 00000001
35-
STATUS_PLUGOUT = '0' # 00000000
36-
# SFP error status always come with STATUS_PLUGIN, so the last bit is always 1
37-
STATUS_ERR_I2C_STUCK = '3' # 00000011
38-
STATUS_ERR_BAD_EEPROM = '5' # 00000101
39-
STATUS_ERR_UNSUPPORTED_CABLE = '9' # 00001001
40-
STATUS_ERR_HIGH_TEMP = '17' # 00010001
41-
STATUS_ERR_BAD_CABLE = '33' # 00100001
42-
4334
# SFP statuses used in this file only; they are not exposed to XCVRD
4435
# STATUS_ERROR will be mapped to different status according to the error code
4536
STATUS_UNKNOWN = '-1'
@@ -69,19 +60,39 @@ class MockSxFd(object):
6960
'''
7061

7162
# SFP errors that will block eeprom accessing
72-
sdk_sfp_err_type_dict = {
73-
0x2: STATUS_ERR_I2C_STUCK,
74-
0x3: STATUS_ERR_BAD_EEPROM,
75-
0x5: STATUS_ERR_UNSUPPORTED_CABLE,
76-
0x6: STATUS_ERR_HIGH_TEMP,
77-
0x7: STATUS_ERR_BAD_CABLE
63+
SDK_SFP_BLOCKING_ERRORS = [
64+
0x2, # SFP.SFP_ERROR_BIT_I2C_STUCK,
65+
0x3, # SFP.SFP_ERROR_BIT_BAD_EEPROM,
66+
0x5, # SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE,
67+
0x6, # SFP.SFP_ERROR_BIT_HIGH_TEMP,
68+
0x7, # SFP.SFP_ERROR_BIT_BAD_CABLE
69+
]
70+
71+
SDK_ERRORS_TO_ERROR_BITS = {
72+
0x0: SFP.SFP_ERROR_BIT_POWER_BUDGET_EXCEEDED,
73+
0x1: SFP.SFP_MLNX_ERROR_BIT_LONGRANGE_NON_MLNX_CABLE,
74+
0x2: SFP.SFP_ERROR_BIT_I2C_STUCK,
75+
0x3: SFP.SFP_ERROR_BIT_BAD_EEPROM,
76+
0x4: SFP.SFP_MLNX_ERROR_BIT_ENFORCE_PART_NUMBER_LIST,
77+
0x5: SFP.SFP_ERROR_BIT_UNSUPPORTED_CABLE,
78+
0x6: SFP.SFP_ERROR_BIT_HIGH_TEMP,
79+
0x7: SFP.SFP_ERROR_BIT_BAD_CABLE,
80+
0x8: SFP.SFP_MLNX_ERROR_BIT_PMD_TYPE_NOT_ENABLED,
81+
0xc: SFP.SFP_MLNX_ERROR_BIT_PCIE_POWER_SLOT_EXCEEDED
82+
}
83+
84+
SDK_ERRORS_TO_DESCRIPTION = {
85+
0x1: SFP.SFP_MLNX_ERROR_DESCRIPTION_LONGRANGE_NON_MLNX_CABLE,
86+
0x4: SFP.SFP_MLNX_ERROR_DESCRIPTION_ENFORCE_PART_NUMBER_LIST,
87+
0x8: SFP.SFP_MLNX_ERROR_DESCRIPTION_PMD_TYPE_NOT_ENABLED,
88+
0xc: SFP.SFP_MLNX_ERROR_DESCRIPTION_PCIE_POWER_SLOT_EXCEEDED
7889
}
7990

8091
sfp_value_status_dict = {
81-
SDK_SFP_STATE_IN: STATUS_PLUGIN,
82-
SDK_SFP_STATE_OUT: STATUS_PLUGOUT,
92+
SDK_SFP_STATE_IN: str(SFP.SFP_STATUS_BIT_INSERTED),
93+
SDK_SFP_STATE_OUT: str(SFP.SFP_STATUS_BIT_REMOVED),
8394
SDK_SFP_STATE_ERR: STATUS_ERROR,
84-
SDK_SFP_STATE_DIS: STATUS_PLUGOUT,
95+
SDK_SFP_STATE_DIS: str(SFP.SFP_STATUS_BIT_REMOVED),
8596
}
8697

8798
# system level event/error
@@ -204,7 +215,7 @@ def deinitialize(self):
204215
delete_sx_fd_t_p(self.rx_fd_p)
205216
delete_sx_user_channel_t_p(self.user_channel_p)
206217

207-
def check_sfp_status(self, port_change, timeout):
218+
def check_sfp_status(self, port_change, error_dict, timeout):
208219
"""
209220
the meaning of timeout is aligned with select.select, which has the following meaning:
210221
0: poll, returns without blocked
@@ -242,6 +253,7 @@ def check_sfp_status(self, port_change, timeout):
242253
break
243254

244255
sfp_state = sfp_value_status_dict.get(module_state, STATUS_UNKNOWN)
256+
error_description = None
245257
if sfp_state == STATUS_UNKNOWN:
246258
# in the following sequence, STATUS_UNKNOWN can be returned.
247259
# so we shouldn't raise exception here.
@@ -256,18 +268,29 @@ def check_sfp_status(self, port_change, timeout):
256268

257269
# If the SDK reports SFP status error (0x3), read error_type to get the detailed error
258270
if sfp_state == STATUS_ERROR:
259-
if error_type in sdk_sfp_err_type_dict.keys():
260-
# In SFP at error status case, need to overwrite the sfp_state with the exact error code
261-
sfp_state = sdk_sfp_err_type_dict[error_type]
262-
else:
263-
# For errors don't block the eeprom accessing, we don't report it to XCVRD
264-
logger.log_info("SFP error on port but not blocking eeprom read, error_type {}".format(error_type))
265-
found +=1
271+
sfp_state_bits = SDK_ERRORS_TO_ERROR_BITS.get(error_type)
272+
if sfp_state_bits is None:
273+
logger.log_error("Unrecognized error {} detected on ports {}".format(error_type, port_list))
274+
found += 1
266275
continue
267276

277+
if error_type in SDK_SFP_BLOCKING_ERRORS:
278+
# This error blocks eeprom access, so also set the blocking bit in the reported state
279+
sfp_state_bits |= SFP.SFP_ERROR_BIT_BLOCKING
280+
281+
# An error should always be set along with 'INSERTED'
282+
sfp_state_bits |= SFP.SFP_STATUS_BIT_INSERTED
283+
284+
# For vendor specific errors, the description should be returned as well
285+
error_description = SDK_ERRORS_TO_DESCRIPTION.get(error_type)
286+
287+
sfp_state = str(sfp_state_bits)
288+
268289
for port in port_list:
269290
logger.log_info("SFP on port {} state {}".format(port, sfp_state))
270291
port_change[port+1] = sfp_state
292+
if error_description:
293+
error_dict[port+1] = error_description
271294
found += 1
272295

273296
return found != 0

platform/mellanox/mlnx-platform-api/tests/test_sfp.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@
88
modules_path = os.path.dirname(test_path)
99
sys.path.insert(0, modules_path)
1010

11+
os.environ["PLATFORM_API_UNIT_TESTING"] = "1"
12+
1113
from sonic_py_common import device_info
12-
from sonic_platform.sfp import SFP
14+
from sonic_platform.sfp import SFP, SX_PORT_MODULE_STATUS_INITIALIZING, SX_PORT_MODULE_STATUS_PLUGGED, SX_PORT_MODULE_STATUS_UNPLUGGED, SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR, SX_PORT_MODULE_STATUS_PLUGGED_DISABLED
15+
1316
from sonic_platform.chassis import Chassis
1417

1518

@@ -26,8 +29,14 @@ def mock_get_sdk_handle(self):
2629
self.sdk_handle = 1
2730
return self.sdk_handle
2831

32+
33+
def mock_get_sfp_error_code(self):
    """Return the (oper_code, error_code) pair previously set on the object by the test"""
    mocked_state = (self.oper_code, self.error_code)
    return mocked_state
35+
36+
2937
device_info.get_platform = mock_get_platform
3038
SFP._read_eeprom_specific_bytes = mock_read_eeprom_specific_bytes
39+
SFP._get_error_code = mock_get_sfp_error_code
3140
Chassis.get_sdk_handle = mock_get_sdk_handle
3241

3342

@@ -82,3 +91,35 @@ def test_sfp_full_initialize_without_partial():
8291
# Verify when get_sfp is called, the SFP modules won't be initialized again
8392
sfp1 = allsfp[0]
8493
assert sfp1 == chassis.get_sfp(1)
94+
95+
96+
def test_sfp_get_error_status():
    """Check get_error_description for each known error code and module status"""
    # Fetch an SFP module to test
    sfp = Chassis().get_sfp(1)

    description_dict = sfp._get_error_description_dict()

    # Plugged with error: every code in the table yields its own description
    sfp.oper_code = SX_PORT_MODULE_STATUS_PLUGGED_WITH_ERROR
    for error, expected in description_dict.items():
        sfp.error_code = error
        assert sfp.get_error_description() == expected

    # A code that is not in the table yields the generic text
    sfp.error_code = -1
    assert sfp.get_error_description() == "Unknown error (-1)"

    # The remaining statuses each map to a fixed description
    expected_description_list = (
        (SX_PORT_MODULE_STATUS_INITIALIZING, "Initializing"),
        (SX_PORT_MODULE_STATUS_PLUGGED, "OK"),
        (SX_PORT_MODULE_STATUS_UNPLUGGED, "Unplugged"),
        (SX_PORT_MODULE_STATUS_PLUGGED_DISABLED, "Disabled"),
    )
    for oper_code, expected_description in expected_description_list:
        sfp.oper_code = oper_code
        assert sfp.get_error_description() == expected_description

platform/mellanox/mlnx-platform-api/tests/test_sfp_event.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
modules_path = os.path.dirname(test_path)
99
sys.path.insert(0, modules_path)
1010

11+
from sonic_platform_base.sfp_base import SfpBase
12+
1113
class TestSfpEvent(object):
1214
@classmethod
1315
def setup_class(cls):
@@ -16,21 +18,29 @@ def setup_class(cls):
1618

1719
def test_check_sfp_status(self):
1820
from sonic_platform.sfp_event import SDK_SFP_STATE_IN, SDK_SFP_STATE_OUT, SDK_SFP_STATE_ERR
19-
from sonic_platform.sfp_event import STATUS_PLUGIN, STATUS_PLUGOUT
20-
from sonic_platform.sfp_event import sdk_sfp_err_type_dict
21+
from sonic_platform.sfp_event import SDK_ERRORS_TO_ERROR_BITS, SDK_ERRORS_TO_DESCRIPTION, SDK_SFP_BLOCKING_ERRORS
2122

22-
self.executor(SDK_SFP_STATE_IN, None, STATUS_PLUGIN)
23-
self.executor(SDK_SFP_STATE_OUT, None, STATUS_PLUGOUT)
24-
for error_type, error_status in sdk_sfp_err_type_dict.items():
25-
self.executor(SDK_SFP_STATE_ERR, error_type, error_status)
23+
self.executor(SDK_SFP_STATE_IN, None, SfpBase.SFP_STATUS_BIT_INSERTED)
24+
self.executor(SDK_SFP_STATE_OUT, None, SfpBase.SFP_STATUS_BIT_REMOVED)
25+
for error_type, error_status in SDK_ERRORS_TO_ERROR_BITS.items():
26+
description = SDK_ERRORS_TO_DESCRIPTION.get(error_type)
27+
if error_type in SDK_SFP_BLOCKING_ERRORS:
28+
error_status |= SfpBase.SFP_ERROR_BIT_BLOCKING
29+
error_status |= SfpBase.SFP_STATUS_BIT_INSERTED
30+
self.executor(SDK_SFP_STATE_ERR, error_type, error_status, description)
2631

27-
def executor(self, mock_module_state, mock_error_type, expect_status):
32+
def executor(self, mock_module_state, mock_error_type, expect_status, description=None):
2833
from sonic_platform.sfp_event import sfp_event
2934

3035
event = sfp_event()
3136
event.on_pmpe = MagicMock(return_value=(True, [0,1], mock_module_state, mock_error_type))
3237
port_change = {}
33-
found = event.check_sfp_status(port_change, 0)
38+
error_dict = {}
39+
found = event.check_sfp_status(port_change, error_dict, 0)
3440
assert found
35-
assert 1 in port_change and port_change[1] == expect_status
36-
assert 2 in port_change and port_change[2] == expect_status
41+
expect_status_str = str(expect_status)
42+
assert 1 in port_change and port_change[1] == expect_status_str
43+
assert 2 in port_change and port_change[2] == expect_status_str
44+
if description:
45+
assert 1 in error_dict and error_dict[1] == description
46+
assert 2 in error_dict and error_dict[2] == description

0 commit comments

Comments
 (0)