1515#include < chrono>
1616#include < math.h>
1717
18+ using Clock = std::chrono::system_clock;
19+ using TimePoint = std::chrono::time_point<Clock>;
20+
1821#define FABRIC_POLLING_INTERVAL_DEFAULT (30 )
1922#define FABRIC_PORT_PREFIX " PORT"
2023#define FABRIC_PORT_ERROR 0
2326#define FABRIC_PORT_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS 10000
2427#define FABRIC_QUEUE_STAT_COUNTER_FLEX_COUNTER_GROUP " FABRIC_QUEUE_STAT_COUNTER"
2528#define FABRIC_QUEUE_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS 100000
26- #define FABRIC_DEBUG_POLLING_INTERVAL_DEFAULT (60 )
29+ #define FABRIC_DEBUG_POLLING_INTERVAL_DEFAULT (12 )
2730#define FABRIC_MONITOR_DATA " FABRIC_MONITOR_DATA"
2831#define APPL_FABRIC_PORT_PREFIX " Fabric"
2932#define SWITCH_DEBUG_COUNTER_FLEX_COUNTER_GROUP " SWITCH_DEBUG_COUNTER"
3235#define SWITCH_STANDARD_DROP_COUNTERS " SWITCH_ID"
3336
3437// constants for link monitoring
38+ #define CHECK_TIME 120
3539#define MAX_SKIP_CRCERR_ON_LNKUP_POLLS 20
3640#define MAX_SKIP_FECERR_ON_LNKUP_POLLS 20
37- // the follow constants will be replaced with the number in config_db
41+ // the following constants will be replaced with the numbers in config_db
3842#define FEC_ISOLATE_POLLS 2
3943#define FEC_UNISOLATE_POLLS 8
4044#define ISOLATION_POLLS_CFG 1
@@ -425,6 +429,7 @@ void FabricPortsOrch::updateFabricDebugCounters()
425429 return ;
426430 }
427431 now = time_now.tv_sec ;
432+ auto checkTime = Clock::now ();
428433
429434 uint64_t fecIsolatedPolls = FEC_ISOLATE_POLLS; // monPollThreshIsolation
430435 uint64_t fecUnisolatePolls = FEC_UNISOLATE_POLLS; // monPollThreshRecovery
@@ -566,6 +571,9 @@ void FabricPortsOrch::updateFabricDebugCounters()
566571 int cfgIsolated = 0 ;
567572 int isolated = 0 ;
568573 int origIsolated = 0 ;
574+ int origPermIsolated = 0 ;
575+ int permIsolate = 0 ;
576+ int linkFlap = 0 ;
569577
570578 // link status
571579 string lnkStatus = " down" ;
@@ -689,6 +697,12 @@ void FabricPortsOrch::updateFabricDebugCounters()
689697 SWSS_LOG_INFO (" port %s currently isolated: %s" , key.c_str (),valuePt.c_str ());
690698 continue ;
691699 }
700+ if (fvField (val) == " PRM_ISOLATED" )
701+ {
702+ origPermIsolated = to_uint<uint8_t >(valuePt);
703+ SWSS_LOG_INFO (" port %s perm isolated: %s" , key.c_str (),valuePt.c_str ());
704+ continue ;
705+ }
692706 if (fvField (val) == " TEST_CRC_ERRORS" )
693707 {
694708 testCrcErrors = std::stoull (valuePt);
@@ -724,7 +738,7 @@ void FabricPortsOrch::updateFabricDebugCounters()
724738 SWSS_LOG_INFO (" Port %d lnk down cnt %lld handled: %lld" , lane, (long long )lnkDownCnt, (long long )preLnkDwnCnt);
725739 if (lnkDownCnt != preLnkDwnCnt)
726740 {
727-
741+ linkFlap = checkDownCnt (key, checkTime) ? 1 : 0 ;
728742 bool clearCnt = false ;
729743 if (origIsolated == 1 && cfgIsolated == 0 )
730744 {
@@ -734,6 +748,11 @@ void FabricPortsOrch::updateFabricDebugCounters()
734748 SWSS_LOG_INFO (" port %s about to clear counters." , key.c_str ());
735749 SWSS_LOG_INFO (" origIsolated %d isolated %d cfgIsolated %d clearCnt %s" , origIsolated, isolated, cfgIsolated, clearCnt ? " true" :" flase" );
736750 clearFabricCnt (lane, clearCnt);
751+
752+ if (linkFlap > 0 )
753+ {
754+ SWSS_LOG_NOTICE (" port %s possibly flapping %d" , key.c_str (), linkFlap);
755+ }
737756 updateStateDbTable (m_stateTable, key, " PORT_DOWN_COUNT_handled" , lnkDownCnt);
738757 continue ;
739758 }
@@ -843,71 +862,77 @@ void FabricPortsOrch::updateFabricDebugCounters()
843862 }
844863
845864 // take care serdes link shut state setting
846- if (lnkStatus == " up" )
865+ // debug information
866+ SWSS_LOG_INFO (" port %s status up autoIsolated %d" ,
867+ key.c_str (), autoIsolated);
868+ SWSS_LOG_INFO (" consecutivePollsWithErrors %lld consecutivePollsWithFecErrs %lld" ,
869+ (long long )consecutivePollsWithErrors, (long long )consecutivePollsWithFecErrs);
870+ SWSS_LOG_INFO (" consecutivePollsWithNoErrors %lld consecutivePollsWithNoFecErrs %lld" ,
871+ (long long )consecutivePollsWithNoErrors, (long long )consecutivePollsWithNoFecErrs);
872+ if (autoIsolated == 0 && (consecutivePollsWithErrors >= isolationPollsCfg
873+ || consecutivePollsWithFecErrs >= fecIsolatedPolls))
847874 {
848- // debug information
849- SWSS_LOG_INFO (" port %s status up autoIsolated %d" ,
850- key.c_str (), autoIsolated);
851- SWSS_LOG_INFO (" consecutivePollsWithErrors %lld consecutivePollsWithFecErrs %lld" ,
852- (long long )consecutivePollsWithErrors, (long long )consecutivePollsWithFecErrs);
853- SWSS_LOG_INFO (" consecutivePollsWithNoErrors %lld consecutivePollsWithNoFecErrs %lld" ,
854- (long long )consecutivePollsWithNoErrors, (long long )consecutivePollsWithNoFecErrs);
855- if (autoIsolated == 0 && (consecutivePollsWithErrors >= isolationPollsCfg
856- || consecutivePollsWithFecErrs >= fecIsolatedPolls))
857- {
858- // Link needs to be isolated.
859- SWSS_LOG_INFO (" port %s auto isolated" , key.c_str ());
860- autoIsolated = 1 ;
861- updateStateDbTable (m_stateTable, key, " AUTO_ISOLATED" , autoIsolated);
862- SWSS_LOG_NOTICE (" port %s set AUTO_ISOLATED %d" , key.c_str (), autoIsolated);
863- }
864- else if (autoIsolated == 1 && consecutivePollsWithNoErrors >= recoveryPollsCfg
865- && consecutivePollsWithNoFecErrs >= fecUnisolatePolls)
875+ // Link needs to be isolated.
876+ SWSS_LOG_INFO (" port %s auto isolated" , key.c_str ());
877+ autoIsolated = 1 ;
878+ permIsolate = addErrorTime (key, checkTime) ? 1 : 0 ;
879+ if (origPermIsolated == 1 )
866880 {
867- // Link is isolated, but no longer needs to be.
868- SWSS_LOG_INFO (" port %s healthy again" , key.c_str ());
869- autoIsolated = 0 ;
870- updateStateDbTable (m_stateTable, key, " AUTO_ISOLATED" , autoIsolated);
871- SWSS_LOG_NOTICE (" port %s set AUTO_ISOLATED %d" , key.c_str (), autoIsolated);
881+ permIsolate = 1 ;
872882 }
873- if (cfgIsolated == 1 )
883+ SWSS_LOG_NOTICE (" port %s get permIsolated" , key.c_str () );
884+ updateStateDbTable (m_stateTable, key, " AUTO_ISOLATED" , autoIsolated);
885+ SWSS_LOG_NOTICE (" port %s set AUTO_ISOLATED %d" , key.c_str (), autoIsolated);
886+ }
887+ else if (autoIsolated == 1 && consecutivePollsWithNoErrors >= recoveryPollsCfg
888+ && consecutivePollsWithNoFecErrs >= fecUnisolatePolls)
889+ {
890+ // Link is isolated, but no longer needs to be.
891+ SWSS_LOG_INFO (" port %s healthy again" , key.c_str ());
892+ autoIsolated = 0 ;
893+ updateStateDbTable (m_stateTable, key, " AUTO_ISOLATED" , autoIsolated);
894+ SWSS_LOG_NOTICE (" port %s set AUTO_ISOLATED %d" , key.c_str (), autoIsolated);
895+ }
896+ if (cfgIsolated == 1 )
897+ {
898+ isolated = 1 ;
899+ SWSS_LOG_INFO (" port %s keep isolated due to configuation" ,key.c_str ());
900+ }
901+ else
902+ {
903+ if (autoIsolated == 1 )
874904 {
875905 isolated = 1 ;
876- SWSS_LOG_INFO (" port %s keep isolated due to configuation " ,key.c_str ());
906+ SWSS_LOG_INFO (" port %s keep isolated due to autoisolation " ,key.c_str ());
877907 }
878908 else
879909 {
880- if (autoIsolated == 1 )
881- {
882- isolated = 1 ;
883- SWSS_LOG_INFO (" port %s keep isolated due to autoisolation" ,key.c_str ());
884- }
885- else
886- {
887- isolated = 0 ;
888- SWSS_LOG_INFO (" port %s unisolated" ,key.c_str ());
889- }
910+ isolated = 0 ;
911+ SWSS_LOG_INFO (" port %s unisolated" ,key.c_str ());
890912 }
891- // if "ISOLATED" is true, Call SAI api here to actually isolated the link
892- // if "ISOLATED" is false, Call SAP api to actually unisolate the link
913+ }
914+ // if "ISOLATED" is true, Call SAI api here to actually isolated the link
915+ // if "ISOLATED" is false, Call SAP api to actually unisolate the link
893916
894- if (origIsolated != isolated)
895- {
896- bool setVal = false ;
897- if (isolated == 1 )
898- {
899- setVal = true ;
900- }
901- isolateFabricLink (lane, setVal);
902- }
903- else
917+ if (permIsolate == 1 || origPermIsolated == 1 )
918+ {
919+ isolated = 1 ;
920+ permIsolate = 1 ;
921+ SWSS_LOG_INFO (" port %s permentantly isolated %d" ,key.c_str (), permIsolate );
922+ }
923+
924+ if (origIsolated != isolated)
925+ {
926+ bool setVal = false ;
927+ if (isolated == 1 )
904928 {
905- SWSS_LOG_INFO ( " Same isolation status for %d " , lane) ;
929+ setVal = true ;
906930 }
931+ isolateFabricLink (lane, setVal);
907932 }
908933 else
909934 {
910- SWSS_LOG_INFO (" link down " );
935+ SWSS_LOG_INFO ( " Same isolation status for %d " , lane );
911936 }
912937
913938 // Update state_db with link isolation data
@@ -917,6 +942,7 @@ void FabricPortsOrch::updateFabricDebugCounters()
917942 updateStateDbTable (m_stateTable, key, " POLL_WITH_NOFEC_ERRORS" , consecutivePollsWithNoFecErrs);
918943 updateStateDbTable (m_stateTable, key, " CONFIG_ISOLATED" , cfgIsolated);
919944 updateStateDbTable (m_stateTable, key, " ISOLATED" , isolated);
945+ updateStateDbTable (m_stateTable, key, " PRM_ISOLATED" , permIsolate);
920946
921947 // Update state_db with error rate
922948 valuePt = to_string (rxCells);
@@ -1498,6 +1524,7 @@ void FabricPortsOrch::doFabricPortTask(Consumer &consumer)
14981524 // CONFIG_ISOLATED 0
14991525 // ISOLATED 0
15001526 // AUTO_ISOLATED 0
1527+ // PRM_ISOLATED 0
15011528 updateStateDbTable (m_stateTable, state_key, " FORCE_UN_ISOLATE" , forceIsolateCnt);
15021529 updateStateDbTable (m_stateTable, state_key, " POLL_WITH_ERRORS" , m_defaultPollWithErrors);
15031530 updateStateDbTable (m_stateTable, state_key, " POLL_WITH_NO_ERRORS" , m_defaultPollWithNoErrors);
@@ -1506,6 +1533,8 @@ void FabricPortsOrch::doFabricPortTask(Consumer &consumer)
15061533 updateStateDbTable (m_stateTable, state_key, " CONFIG_ISOLATED" , m_defaultConfigIsolated);
15071534 updateStateDbTable (m_stateTable, state_key, " ISOLATED" , m_defaultIsolated);
15081535 updateStateDbTable (m_stateTable, state_key, " AUTO_ISOLATED" , m_defaultAutoIsolated);
1536+ updateStateDbTable (m_stateTable, state_key, " PRM_ISOLATED" , m_defaultIsolated);
1537+ linkQueues.clear ();
15091538
15101539 // unisolate the link
15111540 bool setVal = false ;
@@ -1600,3 +1629,92 @@ void FabricPortsOrch::createSwitchDropCounters(void)
16001629
16011630 switch_drop_counter_manager->setCounterIdList (gSwitchId , CounterType::SWITCH_DEBUG, counter_stats);
16021631}
1632+
1633+ bool FabricPortsOrch::addErrorTime (const std::string& link, TimePoint now)
1634+ {
1635+ bool permIsolate = false ;
1636+ auto & timestamps = linkQueues[link];
1637+ std::time_t now_c = Clock::to_time_t (now);
1638+ SWSS_LOG_INFO (" link: %s auto isolate at %s" , link.c_str (), asctime (gmtime (&now_c)));
1639+
1640+ // Add new timestamp to the queue
1641+ timestamps.push (now);
1642+ // Check if we have at least 3 timestamps, and pop the old timestamps
1643+ auto last = timestamps.back ();
1644+ auto first = timestamps.front ();
1645+
1646+ auto diff = last - first;
1647+ auto checkPeriod = std::chrono::minutes (CHECK_TIME);
1648+ auto hours = checkPeriod.count ();
1649+ SWSS_LOG_INFO (" check time window: %lld" , static_cast <long long >(hours) );
1650+ while (diff > checkPeriod)
1651+ {
1652+ timestamps.pop (); // Remove old timestamp
1653+ first = timestamps.front ();
1654+ diff = std::chrono::duration_cast<std::chrono::minutes>(last - first);
1655+ }
1656+ if (timestamps.size () >= 3 )
1657+ {
1658+ first = timestamps.front ();
1659+ diff = last - first;
1660+
1661+ if (diff <= checkPeriod)
1662+ { // If within 2 hours
1663+ permIsolate = true ;
1664+ } else {
1665+ SWSS_LOG_INFO (" do not perm isolated the link" );
1666+ }
1667+ } else {
1668+ SWSS_LOG_INFO (" Not enough events yet" );
1669+ }
1670+ auto ptTime = std::chrono::duration_cast<std::chrono::minutes>(diff).count ();
1671+ if (permIsolate)
1672+ {
1673+ SWSS_LOG_INFO (" Event queue size %u isolation within %lld, so perm isolated: %d" ,
1674+ static_cast <unsigned int >(timestamps.size ()),
1675+ static_cast <long long >(ptTime), permIsolate);
1676+ }
1677+ SWSS_LOG_INFO (" Add isolation event: check period diff %lld size %u perm: %d" ,
1678+ static_cast <long long >(ptTime),
1679+ static_cast <unsigned int >(timestamps.size ()), permIsolate);
1680+ return permIsolate;
1681+ }
1682+
1683+ // The link status will shows down if the card get removed/power cycled
1684+ // or link actually flaping. If the link get status down too many times
1685+ // during the last several hours, say 2 hours, we consider the links mostly
1686+ // flaky, and will try to isolate the link.
1687+ bool FabricPortsOrch::checkDownCnt (const std::string& link, TimePoint now)
1688+ {
1689+ bool linkFlapped = false ;
1690+
1691+ auto & timestamps = linkQueues[link];
1692+ timestamps.push (now);
1693+
1694+ auto last = timestamps.back ();
1695+ auto first = timestamps.front ();
1696+ auto diff = last - first;
1697+ auto checkPeriod = std::chrono::minutes (CHECK_TIME);
1698+ while (diff > checkPeriod)
1699+ {
1700+ timestamps.pop (); // Remove old timestamp
1701+ first = timestamps.front ();
1702+ diff = std::chrono::duration_cast<std::chrono::minutes>(last - first);
1703+ }
1704+ if (timestamps.size () >= 3 )
1705+ {
1706+ first = timestamps.front ();
1707+ diff = last - first;
1708+
1709+ if (diff <= checkPeriod)
1710+ { // If within 2 hours
1711+ linkFlapped = true ;
1712+ } else {
1713+ SWSS_LOG_INFO (" The link down may from peer cards gone" );
1714+ }
1715+ } else {
1716+ SWSS_LOG_INFO (" Not enough events to check yet" );
1717+ }
1718+
1719+ return linkFlapped;
1720+ }
0 commit comments