1616 import sys
1717 import threading
1818 import time
19+ import datetime
1920 import subprocess
2021
2122 from sonic_py_common import daemon_base , device_info , logger
@@ -850,7 +851,10 @@ def is_fast_reboot_enabled():
850851
851852class CmisManagerTask :
852853
853- NUM_CHANNELS = 8
854+ CMIS_MAX_RETRIES = 3
855+ CMIS_DEF_EXPIRED = 60 # seconds, default expiration time
856+ CMIS_MODULE_TYPES = ['QSFP-DD' , 'QSFP_DD' , 'OSFP' ]
857+ CMIS_NUM_CHANNELS = 8
854858
855859 CMIS_STATE_UNKNOWN = 'UNKNOWN'
856860 CMIS_STATE_INSERTED = 'INSERTED'
@@ -899,9 +903,9 @@ def on_port_update_event(self, port_change_event):
899903 if pport is None :
900904 return
901905
902- # Skip if the port/cage type is not QSFP-DD
906+ # Skip if the port/cage type is not a CMIS module
903907 ptype = _wrapper_get_sfp_type (pport )
904- if ptype not in [ 'QSFP-DD' , 'QSFP_DD' ] :
908+ if ptype not in self . CMIS_MODULE_TYPES :
905909 return
906910
907911 if lport not in self .port_dict :
@@ -915,6 +919,7 @@ def on_port_update_event(self, port_change_event):
915919 if port_change_event .port_dict is not None and 'lanes' in port_change_event .port_dict :
916920 self .port_dict [lport ]['lanes' ] = port_change_event .port_dict ['lanes' ]
917921 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_INSERTED
922+ self .reset_cmis_init (lport , 0 )
918923 else :
919924 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_REMOVED
920925
@@ -968,7 +973,7 @@ def get_cmis_application_desired(self, api, channel, speed):
968973 return 0
969974
970975 host_lane_count = 0
971- for lane in range (self .NUM_CHANNELS ):
976+ for lane in range (self .CMIS_NUM_CHANNELS ):
972977 if ((1 << lane ) & channel ) == 0 :
973978 continue
974979 host_lane_count += 1
@@ -1011,7 +1016,7 @@ def is_cmis_application_update_required(self, api, channel, speed):
10111016 return False
10121017
10131018 app_old = 0
1014- for lane in range (api . NUM_CHANNELS ):
1019+ for lane in range (self . CMIS_NUM_CHANNELS ):
10151020 if ((1 << lane ) & channel ) == 0 :
10161021 continue
10171022 if app_old == 0 :
@@ -1025,7 +1030,7 @@ def is_cmis_application_update_required(self, api, channel, speed):
10251030 skip = True
10261031 dp_state = api .get_datapath_state ()
10271032 conf_state = api .get_config_datapath_hostlane_status ()
1028- for lane in range (api . NUM_CHANNELS ):
1033+ for lane in range (self . CMIS_NUM_CHANNELS ):
10291034 if ((1 << lane ) & channel ) == 0 :
10301035 continue
10311036 name = "DP{}State" .format (lane + 1 )
@@ -1040,6 +1045,82 @@ def is_cmis_application_update_required(self, api, channel, speed):
10401045
10411046 return True
10421047
def reset_cmis_init(self, lport, retries=0):
    """
    Put a port back at the start of the CMIS initialization sequence

    Args:
        lport:
            Key of the port entry in self.port_dict
        retries:
            Value stored as the port's retry counter, 0 by default
    """
    restart_fields = {
        'cmis_state': self.CMIS_STATE_INSERTED,
        'cmis_retries': retries,
        'cmis_expired': None,  # No expiration
    }
    self.port_dict[lport].update(restart_fields)
1052+
1053+ def test_module_state (self , api , states ):
1054+ """
1055+ Check if the CMIS module is in the specified state
1056+
1057+ Args:
1058+ api:
1059+ XcvrApi object
1060+ states:
1061+ List, a string list of states
1062+
1063+ Returns:
1064+ Boolean, true if it's in the specified state, otherwise false
1065+ """
1066+ return api .get_module_state () in states
1067+
1068+ def test_config_error (self , api , channel , states ):
1069+ """
1070+ Check if the CMIS configuration states are in the specified state
1071+
1072+ Args:
1073+ api:
1074+ XcvrApi object
1075+ channel:
1076+ Integer, a bitmask of the lanes on the host side
1077+ e.g. 0x5 for lane 0 and lane 2.
1078+ states:
1079+ List, a string list of states
1080+
1081+ Returns:
1082+ Boolean, true if all lanes are in the specified state, otherwise false
1083+ """
1084+ done = True
1085+ cerr = api .get_config_datapath_hostlane_status ()
1086+ for lane in range (self .CMIS_NUM_CHANNELS ):
1087+ if ((1 << lane ) & channel ) == 0 :
1088+ continue
1089+ key = "ConfigStatusLane{}" .format (lane + 1 )
1090+ if cerr [key ] not in states :
1091+ done = False
1092+ break
1093+
1094+ return done
1095+
1096+ def test_datapath_state (self , api , channel , states ):
1097+ """
1098+ Check if the CMIS datapath states are in the specified state
1099+
1100+ Args:
1101+ api:
1102+ XcvrApi object
1103+ channel:
1104+ Integer, a bitmask of the lanes on the host side
1105+ e.g. 0x5 for lane 0 and lane 2.
1106+ states:
1107+ List, a string list of states
1108+
1109+ Returns:
1110+ Boolean, true if all lanes are in the specified state, otherwise false
1111+ """
1112+ done = True
1113+ dpstate = api .get_datapath_state ()
1114+ for lane in range (self .CMIS_NUM_CHANNELS ):
1115+ if ((1 << lane ) & channel ) == 0 :
1116+ continue
1117+ key = "DP{}State" .format (lane + 1 )
1118+ if dpstate [key ] not in states :
1119+ done = False
1120+ break
1121+
1122+ return done
1123+
10431124 def task_worker (self ):
10441125 self .log_notice ("Starting..." )
10451126
@@ -1105,98 +1186,133 @@ def task_worker(self):
11051186 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_READY
11061187 continue
11071188
1108- # Skip if it's not a QSFP-DD
1189+ # Skip if it's not a CMIS module
11091190 type = api .get_module_type_abbreviation ()
1110- if (type is None ) or (type not in [ 'QSFP-DD' , 'QSFP_DD' ] ):
1191+ if (type is None ) or (type not in self . CMIS_MODULE_TYPES ):
11111192 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_READY
11121193 continue
11131194 except AttributeError :
11141195 # Skip if these essential routines are not available
11151196 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_READY
11161197 continue
11171198
1118- self .log_notice ("{}: {}G, lanemask=0x{:x}, state={}" .format (
1119- lport , int (speed / 1000 ), host_lanes , state ))
1199+ # CMIS expiration and retries
1200+ #
1201+ # A retry should always start over at INSERTED state, while the
1202+ # expiration will reset the state to INSERTED and advance the
1203+ # retry counter
1204+ now = datetime .datetime .now ()
1205+ expired = self .port_dict [lport ].get ('cmis_expired' )
1206+ retries = self .port_dict [lport ].get ('cmis_retries' , 0 )
1207+ self .log_notice ("{}: {}G, lanemask=0x{:x}, state={}, retries={}" .format (
1208+ lport , int (speed / 1000 ), host_lanes , state , retries ))
1209+ if retries > self .CMIS_MAX_RETRIES :
1210+ self .log_error ("{}: FAILED" .format (lport ))
1211+ self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_FAILED
1212+ continue
11201213
11211214 try :
1215+ # CMIS state transitions
11221216 if state == self .CMIS_STATE_INSERTED :
1217+
1218+ appl = self .get_cmis_application_desired (api , host_lanes , host_speed )
1219+ if appl < 1 :
1220+ self .log_error ("{}: no suitable app for the port" .format (lport ))
1221+ self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_FAILED
1222+ continue
1223+
11231224 has_update = self .is_cmis_application_update_required (api , host_lanes , host_speed )
11241225 if not has_update :
11251226 # No application updates
1126- state = self .CMIS_STATE_READY
1127- self .log_notice ("{}: state={}" .format (lport , state ))
1128- self .port_dict [lport ]['cmis_state' ] = state
1227+ self .log_notice ("{}: READY" .format (lport ))
1228+ self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_READY
11291229 continue
1130- appl = self .get_cmis_application_desired (api , host_lanes , host_speed )
1131- self .port_dict [lport ]['cmis_apsel' ] = appl
11321230
11331231 # D.2.2 Software Deinitialization
11341232 api .set_datapath_deinit (host_lanes )
11351233 api .set_lpmode (True )
1234+ if not self .test_module_state (api , ['ModuleReady' , 'ModuleLowPwr' ]):
1235+ self .log_notice ("{}: unable to enter low-power mode" .format (lport ))
1236+ self .port_dict [lport ]['cmis_retries' ] = retries + 1
1237+ continue
1238+
11361239 # D.1.3 Software Configuration and Initialization
1137- api .tx_disable_channel (host_lanes , True )
1240+ if not api .tx_disable_channel (host_lanes , True ):
1241+ self .log_notice ("{}: unable to turn off tx power" .format (lport ))
1242+ self .port_dict [lport ]['cmis_retries' ] = retries + 1
1243+ continue
11381244 api .set_lpmode (False )
11391245
1246+ # TODO: Use fine grained time when the CMIS memory map is available
1247+ self .port_dict [lport ]['cmis_expired' ] = now + datetime .timedelta (seconds = self .CMIS_DEF_EXPIRED )
11401248 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_DP_DEINIT
11411249 elif state == self .CMIS_STATE_DP_DEINIT :
1142- if api .get_module_state () != 'ModuleReady' :
1250+ if not self .test_module_state (api , ['ModuleReady' ]):
1251+ if (expired is not None ) and (expired <= now ):
1252+ self .log_notice ("{}: timeout for 'ModuleReady'" .format (lport ))
1253+ self .reset_cmis_init (lport , retries + 1 )
1254+ continue
1255+ if not self .test_datapath_state (api , host_lanes , ['DataPathDeinit' , 'DataPathDeactivated' ]):
1256+ if (expired is not None ) and (expired <= now ):
1257+ self .log_notice ("{}: timeout for 'DataPathDeinit'" .format (lport ))
1258+ self .reset_cmis_init (lport , retries + 1 )
11431259 continue
11441260
11451261 # D.1.3 Software Configuration and Initialization
1146- api .set_application (host_lanes , self .port_dict [lport ]['cmis_apsel' ])
1262+ appl = self .get_cmis_application_desired (api , host_lanes , host_speed )
1263+ if appl < 1 :
1264+ self .log_error ("{}: no suitable app for the port" .format (lport ))
1265+ self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_FAILED
1266+ continue
1267+
1268+ if not api .set_application (host_lanes , appl ):
1269+ self .log_notice ("{}: unable to set application" .format (lport ))
1270+ self .reset_cmis_init (lport , retries + 1 )
1271+ continue
11471272
1273+ # TODO: Use fine grained time when the CMIS memory map is available
1274+ self .port_dict [lport ]['cmis_expired' ] = now + datetime .timedelta (seconds = self .CMIS_DEF_EXPIRED )
11481275 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_AP_CONF
11491276 elif state == self .CMIS_STATE_AP_CONF :
1150- st = api .get_config_datapath_hostlane_status ()
1151- done = True
1152- for lane in range (self .NUM_CHANNELS ):
1153- if ((1 << lane ) & host_lanes ) == 0 :
1154- continue
1155- name = "ConfigStatusLane{}" .format (lane + 1 )
1156- if st [name ] != 'ConfigSuccess' :
1157- done = False
1158- continue
1159- if not done :
1277+ if not self .test_config_error (api , host_lanes , ['ConfigSuccess' ]):
1278+ if (expired is not None ) and (expired <= now ):
1279+ self .log_notice ("{}: timeout for 'ConfigSuccess'" .format (lport ))
1280+ self .reset_cmis_init (lport , retries + 1 )
11601281 continue
11611282
11621283 # D.1.3 Software Configuration and Initialization
11631284 api .set_datapath_init (host_lanes )
11641285
1286+ # TODO: Use fine grained time when the CMIS memory map is available
1287+ self .port_dict [lport ]['cmis_expired' ] = now + datetime .timedelta (seconds = self .CMIS_DEF_EXPIRED )
11651288 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_DP_INIT
11661289 elif state == self .CMIS_STATE_DP_INIT :
1167- st = api .get_datapath_state ()
1168- done = True
1169- for lane in range (self .NUM_CHANNELS ):
1170- if ((1 << lane ) & host_lanes ) == 0 :
1171- continue
1172- name = "DP{}State" .format (lane + 1 )
1173- if st [name ] != 'DataPathInitialized' :
1174- done = False
1175- continue
1176- if not done :
1290+ if not self .test_datapath_state (api , host_lanes , ['DataPathInitialized' ]):
1291+ if (expired is not None ) and (expired <= now ):
1292+ self .log_notice ("{}: timeout for 'DataPathInitialized'" .format (lport ))
1293+ self .reset_cmis_init (lport , retries + 1 )
11771294 continue
11781295
11791296 # D.1.3 Software Configuration and Initialization
1180- api .tx_disable_channel (host_lanes , False )
1297+ if not api .tx_disable_channel (host_lanes , False ):
1298+ self .log_notice ("{}: unable to turn on tx power" .format (lport ))
1299+ self .reset_cmis_init (lport , retries + 1 )
1300+ continue
11811301
1302+ # TODO: Use fine grained timeout when the CMIS memory map is available
1303+ self .port_dict [lport ]['cmis_expired' ] = now + datetime .timedelta (seconds = self .CMIS_DEF_EXPIRED )
11821304 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_DP_TXON
11831305 elif state == self .CMIS_STATE_DP_TXON :
1184- st = api .get_datapath_state ()
1185- done = True
1186- for lane in range (self .NUM_CHANNELS ):
1187- if ((1 << lane ) & host_lanes ) == 0 :
1188- continue
1189- name = "DP{}State" .format (lane + 1 )
1190- if st [name ] != 'DataPathActivated' :
1191- done = False
1192- continue
1193- if not done :
1306+ if not self .test_datapath_state (api , host_lanes , ['DataPathActivated' ]):
1307+ if (expired is not None ) and (expired <= now ):
1308+ self .log_notice ("{}: timeout for 'DataPathActivated'" .format (lport ))
1309+ self .reset_cmis_init (lport , retries + 1 )
11941310 continue
1195- state = self .CMIS_STATE_READY
1196- self .log_notice ("{}: state={}" .format (lport , state ))
1197- self .port_dict [lport ]['cmis_state' ] = state
1311+ self .log_notice ("{}: READY" .format (lport ))
1312+ self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_READY
11981313
11991314 except (NotImplementedError , AttributeError ):
1315+ self .log_error ("{}: internal errors" .format (lport ))
12001316 self .port_dict [lport ]['cmis_state' ] = self .CMIS_STATE_FAILED
12011317
12021318 self .log_notice ("Stopped" )
0 commit comments