Skip to content

Commit 8951bbf

Browse files
committed
add CMIS expiration and retries
Signed-off-by: Dante Su <dante.su@broadcom.com>
1 parent a258f2a commit 8951bbf

1 file changed

Lines changed: 168 additions & 52 deletions

File tree

sonic-xcvrd/xcvrd/xcvrd.py

Lines changed: 168 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import sys
1717
import threading
1818
import time
19+
import datetime
1920
import subprocess
2021

2122
from sonic_py_common import daemon_base, device_info, logger
@@ -850,7 +851,10 @@ def is_fast_reboot_enabled():
850851

851852
class CmisManagerTask:
852853

853-
NUM_CHANNELS = 8
854+
CMIS_MAX_RETRIES = 3
855+
CMIS_DEF_EXPIRED = 60 # seconds, default expiration time
856+
CMIS_MODULE_TYPES = ['QSFP-DD', 'QSFP_DD', 'OSFP']
857+
CMIS_NUM_CHANNELS = 8
854858

855859
CMIS_STATE_UNKNOWN = 'UNKNOWN'
856860
CMIS_STATE_INSERTED = 'INSERTED'
@@ -899,9 +903,9 @@ def on_port_update_event(self, port_change_event):
899903
if pport is None:
900904
return
901905

902-
# Skip if the port/cage type is not QSFP-DD
906+
# Skip if the port/cage type is not a CMIS module type
903907
ptype = _wrapper_get_sfp_type(pport)
904-
if ptype not in ['QSFP-DD', 'QSFP_DD']:
908+
if ptype not in self.CMIS_MODULE_TYPES:
905909
return
906910

907911
if lport not in self.port_dict:
@@ -915,6 +919,7 @@ def on_port_update_event(self, port_change_event):
915919
if port_change_event.port_dict is not None and 'lanes' in port_change_event.port_dict:
916920
self.port_dict[lport]['lanes'] = port_change_event.port_dict['lanes']
917921
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_INSERTED
922+
self.reset_cmis_init(lport, 0)
918923
else:
919924
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_REMOVED
920925

@@ -968,7 +973,7 @@ def get_cmis_application_desired(self, api, channel, speed):
968973
return 0
969974

970975
host_lane_count = 0
971-
for lane in range(self.NUM_CHANNELS):
976+
for lane in range(self.CMIS_NUM_CHANNELS):
972977
if ((1 << lane) & channel) == 0:
973978
continue
974979
host_lane_count += 1
@@ -1011,7 +1016,7 @@ def is_cmis_application_update_required(self, api, channel, speed):
10111016
return False
10121017

10131018
app_old = 0
1014-
for lane in range(api.NUM_CHANNELS):
1019+
for lane in range(self.CMIS_NUM_CHANNELS):
10151020
if ((1 << lane) & channel) == 0:
10161021
continue
10171022
if app_old == 0:
@@ -1025,7 +1030,7 @@ def is_cmis_application_update_required(self, api, channel, speed):
10251030
skip = True
10261031
dp_state = api.get_datapath_state()
10271032
conf_state = api.get_config_datapath_hostlane_status()
1028-
for lane in range(api.NUM_CHANNELS):
1033+
for lane in range(self.CMIS_NUM_CHANNELS):
10291034
if ((1 << lane) & channel) == 0:
10301035
continue
10311036
name = "DP{}State".format(lane + 1)
@@ -1040,6 +1045,82 @@ def is_cmis_application_update_required(self, api, channel, speed):
10401045

10411046
return True
10421047

1048+
def reset_cmis_init(self, lport, retries=0):
    """
    Restart the CMIS initialization sequence of a logical port

    The port is put back to the INSERTED state, its retry counter is
    overwritten with the given value and any pending expiration is dropped.

    Args:
        lport:
            Key of the port entry in self.port_dict
        retries:
            Integer, the value to store as the retry counter

    Returns:
        None
    """
    entry = self.port_dict[lport]
    entry['cmis_state'] = self.CMIS_STATE_INSERTED
    entry['cmis_retries'] = retries
    # None means no expiration is currently armed
    entry['cmis_expired'] = None
1052+
1053+
def test_module_state(self, api, states):
1054+
"""
1055+
Check if the CMIS module is in the specified state
1056+
1057+
Args:
1058+
api:
1059+
XcvrApi object
1060+
states:
1061+
List, a string list of states
1062+
1063+
Returns:
1064+
Boolean, true if it's in the specified state, otherwise false
1065+
"""
1066+
return api.get_module_state() in states
1067+
1068+
def test_config_error(self, api, channel, states):
1069+
"""
1070+
Check if the CMIS configuration states are in the specified state
1071+
1072+
Args:
1073+
api:
1074+
XcvrApi object
1075+
channel:
1076+
Integer, a bitmask of the lanes on the host side
1077+
e.g. 0x5 for lane 0 and lane 2.
1078+
states:
1079+
List, a string list of states
1080+
1081+
Returns:
1082+
Boolean, true if all lanes are in the specified state, otherwise false
1083+
"""
1084+
done = True
1085+
cerr = api.get_config_datapath_hostlane_status()
1086+
for lane in range(self.CMIS_NUM_CHANNELS):
1087+
if ((1 << lane) & channel) == 0:
1088+
continue
1089+
key = "ConfigStatusLane{}".format(lane + 1)
1090+
if cerr[key] not in states:
1091+
done = False
1092+
break
1093+
1094+
return done
1095+
1096+
def test_datapath_state(self, api, channel, states):
1097+
"""
1098+
Check if the CMIS datapath states are in the specified state
1099+
1100+
Args:
1101+
api:
1102+
XcvrApi object
1103+
channel:
1104+
Integer, a bitmask of the lanes on the host side
1105+
e.g. 0x5 for lane 0 and lane 2.
1106+
states:
1107+
List, a string list of states
1108+
1109+
Returns:
1110+
Boolean, true if all lanes are in the specified state, otherwise false
1111+
"""
1112+
done = True
1113+
dpstate = api.get_datapath_state()
1114+
for lane in range(self.CMIS_NUM_CHANNELS):
1115+
if ((1 << lane) & channel) == 0:
1116+
continue
1117+
key = "DP{}State".format(lane + 1)
1118+
if dpstate[key] not in states:
1119+
done = False
1120+
break
1121+
1122+
return done
1123+
10431124
def task_worker(self):
10441125
self.log_notice("Starting...")
10451126

@@ -1105,98 +1186,133 @@ def task_worker(self):
11051186
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY
11061187
continue
11071188

1108-
# Skip if it's not a QSFP-DD
1189+
# Skip if it's not a CMIS module
11091190
type = api.get_module_type_abbreviation()
1110-
if (type is None) or (type not in ['QSFP-DD', 'QSFP_DD']):
1191+
if (type is None) or (type not in self.CMIS_MODULE_TYPES):
11111192
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY
11121193
continue
11131194
except AttributeError:
11141195
# Skip if these essential routines are not available
11151196
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY
11161197
continue
11171198

1118-
self.log_notice("{}: {}G, lanemask=0x{:x}, state={}".format(
1119-
lport, int(speed/1000), host_lanes, state))
1199+
# CMIS expiration and retries
1200+
#
1201+
# A retry should always start over at INSERTED state, while the
1202+
# expiration will reset the state to INSERTED and advance the
1203+
# retry counter
1204+
now = datetime.datetime.now()
1205+
expired = self.port_dict[lport].get('cmis_expired')
1206+
retries = self.port_dict[lport].get('cmis_retries', 0)
1207+
self.log_notice("{}: {}G, lanemask=0x{:x}, state={}, retries={}".format(
1208+
lport, int(speed/1000), host_lanes, state, retries))
1209+
if retries > self.CMIS_MAX_RETRIES:
1210+
self.log_error("{}: FAILED".format(lport))
1211+
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED
1212+
continue
11201213

11211214
try:
1215+
# CMIS state transitions
11221216
if state == self.CMIS_STATE_INSERTED:
1217+
1218+
appl = self.get_cmis_application_desired(api, host_lanes, host_speed)
1219+
if appl < 1:
1220+
self.log_error("{}: no suitable app for the port".format(lport))
1221+
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED
1222+
continue
1223+
11231224
has_update = self.is_cmis_application_update_required(api, host_lanes, host_speed)
11241225
if not has_update:
11251226
# No application updates
1126-
state = self.CMIS_STATE_READY
1127-
self.log_notice("{}: state={}".format(lport, state))
1128-
self.port_dict[lport]['cmis_state'] = state
1227+
self.log_notice("{}: READY".format(lport))
1228+
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY
11291229
continue
1130-
appl = self.get_cmis_application_desired(api, host_lanes, host_speed)
1131-
self.port_dict[lport]['cmis_apsel'] = appl
11321230

11331231
# D.2.2 Software Deinitialization
11341232
api.set_datapath_deinit(host_lanes)
11351233
api.set_lpmode(True)
1234+
if not self.test_module_state(api, ['ModuleReady', 'ModuleLowPwr']):
1235+
self.log_notice("{}: unable to enter low-power mode".format(lport))
1236+
self.port_dict[lport]['cmis_retries'] = retries + 1
1237+
continue
1238+
11361239
# D.1.3 Software Configuration and Initialization
1137-
api.tx_disable_channel(host_lanes, True)
1240+
if not api.tx_disable_channel(host_lanes, True):
1241+
self.log_notice("{}: unable to turn off tx power".format(lport))
1242+
self.port_dict[lport]['cmis_retries'] = retries + 1
1243+
continue
11381244
api.set_lpmode(False)
11391245

1246+
# TODO: Use fine grained time when the CMIS memory map is available
1247+
self.port_dict[lport]['cmis_expired'] = now + datetime.timedelta(seconds=self.CMIS_DEF_EXPIRED)
11401248
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_DP_DEINIT
11411249
elif state == self.CMIS_STATE_DP_DEINIT:
1142-
if api.get_module_state() != 'ModuleReady':
1250+
if not self.test_module_state(api, ['ModuleReady']):
1251+
if (expired is not None) and (expired <= now):
1252+
self.log_notice("{}: timeout for 'ModuleReady'".format(lport))
1253+
self.reset_cmis_init(lport, retries + 1)
1254+
continue
1255+
if not self.test_datapath_state(api, host_lanes, ['DataPathDeinit', 'DataPathDeactivated']):
1256+
if (expired is not None) and (expired <= now):
1257+
self.log_notice("{}: timeout for 'DataPathDeinit'".format(lport))
1258+
self.reset_cmis_init(lport, retries + 1)
11431259
continue
11441260

11451261
# D.1.3 Software Configuration and Initialization
1146-
api.set_application(host_lanes, self.port_dict[lport]['cmis_apsel'])
1262+
appl = self.get_cmis_application_desired(api, host_lanes, host_speed)
1263+
if appl < 1:
1264+
self.log_error("{}: no suitable app for the port".format(lport))
1265+
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED
1266+
continue
1267+
1268+
if not api.set_application(host_lanes, appl):
1269+
self.log_notice("{}: unable to set application".format(lport))
1270+
self.reset_cmis_init(lport, retries + 1)
1271+
continue
11471272

1273+
# TODO: Use fine grained time when the CMIS memory map is available
1274+
self.port_dict[lport]['cmis_expired'] = now + datetime.timedelta(seconds=self.CMIS_DEF_EXPIRED)
11481275
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_AP_CONF
11491276
elif state == self.CMIS_STATE_AP_CONF:
1150-
st = api.get_config_datapath_hostlane_status()
1151-
done = True
1152-
for lane in range(self.NUM_CHANNELS):
1153-
if ((1 << lane) & host_lanes) == 0:
1154-
continue
1155-
name = "ConfigStatusLane{}".format(lane + 1)
1156-
if st[name] != 'ConfigSuccess':
1157-
done = False
1158-
continue
1159-
if not done:
1277+
if not self.test_config_error(api, host_lanes, ['ConfigSuccess']):
1278+
if (expired is not None) and (expired <= now):
1279+
self.log_notice("{}: timeout for 'ConfigSuccess'".format(lport))
1280+
self.reset_cmis_init(lport, retries + 1)
11601281
continue
11611282

11621283
# D.1.3 Software Configuration and Initialization
11631284
api.set_datapath_init(host_lanes)
11641285

1286+
# TODO: Use fine grained time when the CMIS memory map is available
1287+
self.port_dict[lport]['cmis_expired'] = now + datetime.timedelta(seconds=self.CMIS_DEF_EXPIRED)
11651288
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_DP_INIT
11661289
elif state == self.CMIS_STATE_DP_INIT:
1167-
st = api.get_datapath_state()
1168-
done = True
1169-
for lane in range(self.NUM_CHANNELS):
1170-
if ((1 << lane) & host_lanes) == 0:
1171-
continue
1172-
name = "DP{}State".format(lane + 1)
1173-
if st[name] != 'DataPathInitialized':
1174-
done = False
1175-
continue
1176-
if not done:
1290+
if not self.test_datapath_state(api, host_lanes, ['DataPathInitialized']):
1291+
if (expired is not None) and (expired <= now):
1292+
self.log_notice("{}: timeout for 'DataPathInitialized'".format(lport))
1293+
self.reset_cmis_init(lport, retries + 1)
11771294
continue
11781295

11791296
# D.1.3 Software Configuration and Initialization
1180-
api.tx_disable_channel(host_lanes, False)
1297+
if not api.tx_disable_channel(host_lanes, False):
1298+
self.log_notice("{}: unable to turn on tx power".format(lport))
1299+
self.reset_cmis_init(lport, retries + 1)
1300+
continue
11811301

1302+
# TODO: Use fine grained timeout when the CMIS memory map is available
1303+
self.port_dict[lport]['cmis_expired'] = now + datetime.timedelta(seconds=self.CMIS_DEF_EXPIRED)
11821304
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_DP_TXON
11831305
elif state == self.CMIS_STATE_DP_TXON:
1184-
st = api.get_datapath_state()
1185-
done = True
1186-
for lane in range(self.NUM_CHANNELS):
1187-
if ((1 << lane) & host_lanes) == 0:
1188-
continue
1189-
name = "DP{}State".format(lane + 1)
1190-
if st[name] != 'DataPathActivated':
1191-
done = False
1192-
continue
1193-
if not done:
1306+
if not self.test_datapath_state(api, host_lanes, ['DataPathActivated']):
1307+
if (expired is not None) and (expired <= now):
1308+
self.log_notice("{}: timeout for 'DataPathActivated'".format(lport))
1309+
self.reset_cmis_init(lport, retries + 1)
11941310
continue
1195-
state = self.CMIS_STATE_READY
1196-
self.log_notice("{}: state={}".format(lport, state))
1197-
self.port_dict[lport]['cmis_state'] = state
1311+
self.log_notice("{}: READY".format(lport))
1312+
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_READY
11981313

11991314
except (NotImplementedError, AttributeError):
1315+
self.log_error("{}: internal errors".format(lport))
12001316
self.port_dict[lport]['cmis_state'] = self.CMIS_STATE_FAILED
12011317

12021318
self.log_notice("Stopped")

0 commit comments

Comments
 (0)