Skip to content

Commit b017bd3

Browse files
authored
Permanently isolate a fabric port if it is repeatedly flapping. (sonic-net#3933)
If a fabric port repeatedly and rapidly transitions between the isolated and unisolated states, resulting in instability, the algorithm places the link in a permanently isolated state. Currently, the threshold for triggering this condition is a link flapping three times within a two-hour period. Recovery from this state requires manual user intervention via the CLI command: config fabric port unisolate -n asicX --force. The HLD change is at:
1 parent b426b2b commit b017bd3

3 files changed

Lines changed: 200 additions & 54 deletions

File tree

orchagent/fabricportsorch.cpp

Lines changed: 171 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
#include <chrono>
1616
#include <math.h>
1717

18+
using Clock = std::chrono::system_clock;
19+
using TimePoint = std::chrono::time_point<Clock>;
20+
1821
#define FABRIC_POLLING_INTERVAL_DEFAULT (30)
1922
#define FABRIC_PORT_PREFIX "PORT"
2023
#define FABRIC_PORT_ERROR 0
@@ -23,7 +26,7 @@
2326
#define FABRIC_PORT_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS 10000
2427
#define FABRIC_QUEUE_STAT_COUNTER_FLEX_COUNTER_GROUP "FABRIC_QUEUE_STAT_COUNTER"
2528
#define FABRIC_QUEUE_STAT_FLEX_COUNTER_POLLING_INTERVAL_MS 100000
26-
#define FABRIC_DEBUG_POLLING_INTERVAL_DEFAULT (60)
29+
#define FABRIC_DEBUG_POLLING_INTERVAL_DEFAULT (12)
2730
#define FABRIC_MONITOR_DATA "FABRIC_MONITOR_DATA"
2831
#define APPL_FABRIC_PORT_PREFIX "Fabric"
2932
#define SWITCH_DEBUG_COUNTER_FLEX_COUNTER_GROUP "SWITCH_DEBUG_COUNTER"
@@ -32,9 +35,10 @@
3235
#define SWITCH_STANDARD_DROP_COUNTERS "SWITCH_ID"
3336

3437
// constants for link monitoring
38+
// Length, in minutes, of the sliding window over which per-link events are counted
#define CHECK_TIME 120
3539
#define MAX_SKIP_CRCERR_ON_LNKUP_POLLS 20
3640
#define MAX_SKIP_FECERR_ON_LNKUP_POLLS 20
37-
// the follow constants will be replaced with the number in config_db
41+
// the follow will be replaced with the number in config_db
3842
#define FEC_ISOLATE_POLLS 2
3943
#define FEC_UNISOLATE_POLLS 8
4044
#define ISOLATION_POLLS_CFG 1
@@ -425,6 +429,7 @@ void FabricPortsOrch::updateFabricDebugCounters()
425429
return;
426430
}
427431
now = time_now.tv_sec;
432+
auto checkTime = Clock::now();
428433

429434
uint64_t fecIsolatedPolls = FEC_ISOLATE_POLLS; // monPollThreshIsolation
430435
uint64_t fecUnisolatePolls = FEC_UNISOLATE_POLLS; // monPollThreshRecovery
@@ -566,6 +571,9 @@ void FabricPortsOrch::updateFabricDebugCounters()
566571
int cfgIsolated = 0;
567572
int isolated = 0;
568573
int origIsolated = 0;
574+
int origPermIsolated = 0;
575+
int permIsolate = 0;
576+
int linkFlap = 0;
569577

570578
// link status
571579
string lnkStatus = "down";
@@ -689,6 +697,12 @@ void FabricPortsOrch::updateFabricDebugCounters()
689697
SWSS_LOG_INFO("port %s currently isolated: %s", key.c_str(),valuePt.c_str());
690698
continue;
691699
}
700+
if (fvField(val) == "PRM_ISOLATED")
701+
{
702+
origPermIsolated = to_uint<uint8_t>(valuePt);
703+
SWSS_LOG_INFO("port %s perm isolated: %s", key.c_str(),valuePt.c_str());
704+
continue;
705+
}
692706
if (fvField(val) == "TEST_CRC_ERRORS")
693707
{
694708
testCrcErrors = std::stoull(valuePt);
@@ -724,7 +738,7 @@ void FabricPortsOrch::updateFabricDebugCounters()
724738
SWSS_LOG_INFO("Port %d lnk down cnt %lld handled: %lld", lane, (long long)lnkDownCnt, (long long)preLnkDwnCnt);
725739
if (lnkDownCnt != preLnkDwnCnt)
726740
{
727-
741+
linkFlap = checkDownCnt(key, checkTime) ? 1 : 0;
728742
bool clearCnt = false;
729743
if (origIsolated == 1 && cfgIsolated == 0)
730744
{
@@ -734,6 +748,11 @@ void FabricPortsOrch::updateFabricDebugCounters()
734748
SWSS_LOG_INFO("port %s about to clear counters.", key.c_str());
735749
SWSS_LOG_INFO("origIsolated %d isolated %d cfgIsolated %d clearCnt %s", origIsolated, isolated, cfgIsolated, clearCnt ? "true":"flase");
736750
clearFabricCnt(lane, clearCnt);
751+
752+
if (linkFlap > 0 )
753+
{
754+
SWSS_LOG_NOTICE("port %s possibly flapping %d", key.c_str(), linkFlap);
755+
}
737756
updateStateDbTable(m_stateTable, key, "PORT_DOWN_COUNT_handled", lnkDownCnt);
738757
continue;
739758
}
@@ -843,71 +862,77 @@ void FabricPortsOrch::updateFabricDebugCounters()
843862
}
844863

845864
// take care serdes link shut state setting
846-
if (lnkStatus == "up")
865+
// debug information
866+
SWSS_LOG_INFO("port %s status up autoIsolated %d",
867+
key.c_str(), autoIsolated);
868+
SWSS_LOG_INFO("consecutivePollsWithErrors %lld consecutivePollsWithFecErrs %lld",
869+
(long long)consecutivePollsWithErrors, (long long)consecutivePollsWithFecErrs);
870+
SWSS_LOG_INFO("consecutivePollsWithNoErrors %lld consecutivePollsWithNoFecErrs %lld",
871+
(long long)consecutivePollsWithNoErrors, (long long)consecutivePollsWithNoFecErrs);
872+
if (autoIsolated == 0 && (consecutivePollsWithErrors >= isolationPollsCfg
873+
|| consecutivePollsWithFecErrs >= fecIsolatedPolls))
847874
{
848-
// debug information
849-
SWSS_LOG_INFO("port %s status up autoIsolated %d",
850-
key.c_str(), autoIsolated);
851-
SWSS_LOG_INFO("consecutivePollsWithErrors %lld consecutivePollsWithFecErrs %lld",
852-
(long long)consecutivePollsWithErrors, (long long)consecutivePollsWithFecErrs);
853-
SWSS_LOG_INFO("consecutivePollsWithNoErrors %lld consecutivePollsWithNoFecErrs %lld",
854-
(long long)consecutivePollsWithNoErrors, (long long)consecutivePollsWithNoFecErrs);
855-
if (autoIsolated == 0 && (consecutivePollsWithErrors >= isolationPollsCfg
856-
|| consecutivePollsWithFecErrs >= fecIsolatedPolls))
857-
{
858-
// Link needs to be isolated.
859-
SWSS_LOG_INFO("port %s auto isolated", key.c_str());
860-
autoIsolated = 1;
861-
updateStateDbTable(m_stateTable, key, "AUTO_ISOLATED", autoIsolated);
862-
SWSS_LOG_NOTICE("port %s set AUTO_ISOLATED %d", key.c_str(), autoIsolated);
863-
}
864-
else if (autoIsolated == 1 && consecutivePollsWithNoErrors >= recoveryPollsCfg
865-
&& consecutivePollsWithNoFecErrs >= fecUnisolatePolls)
875+
// Link needs to be isolated.
876+
SWSS_LOG_INFO("port %s auto isolated", key.c_str());
877+
autoIsolated = 1;
878+
permIsolate = addErrorTime(key, checkTime) ? 1 : 0;
879+
if (origPermIsolated == 1)
866880
{
867-
// Link is isolated, but no longer needs to be.
868-
SWSS_LOG_INFO("port %s healthy again", key.c_str());
869-
autoIsolated = 0;
870-
updateStateDbTable(m_stateTable, key, "AUTO_ISOLATED", autoIsolated);
871-
SWSS_LOG_NOTICE("port %s set AUTO_ISOLATED %d", key.c_str(), autoIsolated);
881+
permIsolate = 1;
872882
}
873-
if (cfgIsolated == 1)
883+
SWSS_LOG_NOTICE("port %s get permIsolated", key.c_str() );
884+
updateStateDbTable(m_stateTable, key, "AUTO_ISOLATED", autoIsolated);
885+
SWSS_LOG_NOTICE("port %s set AUTO_ISOLATED %d", key.c_str(), autoIsolated);
886+
}
887+
else if (autoIsolated == 1 && consecutivePollsWithNoErrors >= recoveryPollsCfg
888+
&& consecutivePollsWithNoFecErrs >= fecUnisolatePolls)
889+
{
890+
// Link is isolated, but no longer needs to be.
891+
SWSS_LOG_INFO("port %s healthy again", key.c_str());
892+
autoIsolated = 0;
893+
updateStateDbTable(m_stateTable, key, "AUTO_ISOLATED", autoIsolated);
894+
SWSS_LOG_NOTICE("port %s set AUTO_ISOLATED %d", key.c_str(), autoIsolated);
895+
}
896+
if (cfgIsolated == 1)
897+
{
898+
isolated = 1;
899+
SWSS_LOG_INFO("port %s keep isolated due to configuation",key.c_str());
900+
}
901+
else
902+
{
903+
if (autoIsolated == 1)
874904
{
875905
isolated = 1;
876-
SWSS_LOG_INFO("port %s keep isolated due to configuation",key.c_str());
906+
SWSS_LOG_INFO("port %s keep isolated due to autoisolation",key.c_str());
877907
}
878908
else
879909
{
880-
if (autoIsolated == 1)
881-
{
882-
isolated = 1;
883-
SWSS_LOG_INFO("port %s keep isolated due to autoisolation",key.c_str());
884-
}
885-
else
886-
{
887-
isolated = 0;
888-
SWSS_LOG_INFO("port %s unisolated",key.c_str());
889-
}
910+
isolated = 0;
911+
SWSS_LOG_INFO("port %s unisolated",key.c_str());
890912
}
891-
// if "ISOLATED" is true, Call SAI api here to actually isolated the link
892-
// if "ISOLATED" is false, Call SAP api to actually unisolate the link
913+
}
914+
// if "ISOLATED" is true, Call SAI api here to actually isolated the link
915+
// if "ISOLATED" is false, Call SAP api to actually unisolate the link
893916

894-
if (origIsolated != isolated)
895-
{
896-
bool setVal = false;
897-
if (isolated == 1)
898-
{
899-
setVal = true;
900-
}
901-
isolateFabricLink(lane, setVal);
902-
}
903-
else
917+
if (permIsolate == 1 || origPermIsolated == 1)
918+
{
919+
isolated = 1;
920+
permIsolate = 1;
921+
SWSS_LOG_INFO("port %s permentantly isolated %d",key.c_str(), permIsolate );
922+
}
923+
924+
if (origIsolated != isolated)
925+
{
926+
bool setVal = false;
927+
if (isolated == 1)
904928
{
905-
SWSS_LOG_INFO( "Same isolation status for %d", lane);
929+
setVal = true;
906930
}
931+
isolateFabricLink(lane, setVal);
907932
}
908933
else
909934
{
910-
SWSS_LOG_INFO("link down");
935+
SWSS_LOG_INFO( "Same isolation status for %d", lane);
911936
}
912937

913938
// Update state_db with link isolation data
@@ -917,6 +942,7 @@ void FabricPortsOrch::updateFabricDebugCounters()
917942
updateStateDbTable(m_stateTable, key, "POLL_WITH_NOFEC_ERRORS", consecutivePollsWithNoFecErrs);
918943
updateStateDbTable(m_stateTable, key, "CONFIG_ISOLATED", cfgIsolated);
919944
updateStateDbTable(m_stateTable, key, "ISOLATED", isolated);
945+
updateStateDbTable(m_stateTable, key, "PRM_ISOLATED", permIsolate);
920946

921947
// Update state_db with error rate
922948
valuePt = to_string(rxCells);
@@ -1498,6 +1524,7 @@ void FabricPortsOrch::doFabricPortTask(Consumer &consumer)
14981524
// CONFIG_ISOLATED 0
14991525
// ISOLATED 0
15001526
// AUTO_ISOLATED 0
1527+
// PRM_ISOLATED 0
15011528
updateStateDbTable(m_stateTable, state_key, "FORCE_UN_ISOLATE", forceIsolateCnt);
15021529
updateStateDbTable(m_stateTable, state_key, "POLL_WITH_ERRORS", m_defaultPollWithErrors);
15031530
updateStateDbTable(m_stateTable, state_key, "POLL_WITH_NO_ERRORS", m_defaultPollWithNoErrors);
@@ -1506,6 +1533,8 @@ void FabricPortsOrch::doFabricPortTask(Consumer &consumer)
15061533
updateStateDbTable(m_stateTable, state_key, "CONFIG_ISOLATED", m_defaultConfigIsolated);
15071534
updateStateDbTable(m_stateTable, state_key, "ISOLATED", m_defaultIsolated);
15081535
updateStateDbTable(m_stateTable, state_key, "AUTO_ISOLATED", m_defaultAutoIsolated);
1536+
updateStateDbTable(m_stateTable, state_key, "PRM_ISOLATED", m_defaultIsolated);
1537+
linkQueues.clear();
15091538

15101539
// unisolate the link
15111540
bool setVal = false;
@@ -1600,3 +1629,92 @@ void FabricPortsOrch::createSwitchDropCounters(void)
16001629

16011630
switch_drop_counter_manager->setCounterIdList(gSwitchId, CounterType::SWITCH_DEBUG, counter_stats);
16021631
}
1632+
1633+
bool FabricPortsOrch::addErrorTime(const std::string& link, TimePoint now)
1634+
{
1635+
bool permIsolate = false;
1636+
auto& timestamps = linkQueues[link];
1637+
std::time_t now_c = Clock::to_time_t(now);
1638+
SWSS_LOG_INFO("link: %s auto isolate at %s", link.c_str(), asctime(gmtime(&now_c)));
1639+
1640+
// Add new timestamp to the queue
1641+
timestamps.push(now);
1642+
// Check if we have at least 3 timestamps, and pop the old timestamps
1643+
auto last = timestamps.back();
1644+
auto first = timestamps.front();
1645+
1646+
auto diff = last - first;
1647+
auto checkPeriod = std::chrono::minutes(CHECK_TIME);
1648+
auto hours = checkPeriod.count();
1649+
SWSS_LOG_INFO("check time window: %lld", static_cast<long long>(hours) );
1650+
while (diff > checkPeriod)
1651+
{
1652+
timestamps.pop(); // Remove old timestamp
1653+
first = timestamps.front();
1654+
diff = std::chrono::duration_cast<std::chrono::minutes>(last - first);
1655+
}
1656+
if (timestamps.size() >= 3)
1657+
{
1658+
first = timestamps.front();
1659+
diff = last - first;
1660+
1661+
if (diff <= checkPeriod)
1662+
{ // If within 2 hours
1663+
permIsolate = true;
1664+
} else {
1665+
SWSS_LOG_INFO("do not perm isolated the link");
1666+
}
1667+
} else {
1668+
SWSS_LOG_INFO("Not enough events yet");
1669+
}
1670+
auto ptTime = std::chrono::duration_cast<std::chrono::minutes>(diff).count();
1671+
if (permIsolate)
1672+
{
1673+
SWSS_LOG_INFO("Event queue size %u isolation within %lld, so perm isolated: %d",
1674+
static_cast<unsigned int>(timestamps.size()),
1675+
static_cast<long long>(ptTime), permIsolate);
1676+
}
1677+
SWSS_LOG_INFO("Add isolation event: check period diff %lld size %u perm: %d",
1678+
static_cast<long long>(ptTime),
1679+
static_cast<unsigned int>(timestamps.size()), permIsolate);
1680+
return permIsolate;
1681+
}
1682+
1683+
// The link status will shows down if the card get removed/power cycled
1684+
// or link actually flaping. If the link get status down too many times
1685+
// during the last several hours, say 2 hours, we consider the links mostly
1686+
// flaky, and will try to isolate the link.
1687+
bool FabricPortsOrch::checkDownCnt(const std::string& link, TimePoint now)
1688+
{
1689+
bool linkFlapped = false;
1690+
1691+
auto& timestamps = linkQueues[link];
1692+
timestamps.push(now);
1693+
1694+
auto last = timestamps.back();
1695+
auto first = timestamps.front();
1696+
auto diff = last - first;
1697+
auto checkPeriod = std::chrono::minutes(CHECK_TIME);
1698+
while (diff > checkPeriod)
1699+
{
1700+
timestamps.pop(); // Remove old timestamp
1701+
first = timestamps.front();
1702+
diff = std::chrono::duration_cast<std::chrono::minutes>(last - first);
1703+
}
1704+
if (timestamps.size() >= 3)
1705+
{
1706+
first = timestamps.front();
1707+
diff = last - first;
1708+
1709+
if (diff <= checkPeriod)
1710+
{ // If within 2 hours
1711+
linkFlapped = true;
1712+
} else {
1713+
SWSS_LOG_INFO("The link down may from peer cards gone");
1714+
}
1715+
} else {
1716+
SWSS_LOG_INFO("Not enough events to check yet");
1717+
}
1718+
1719+
return linkFlapped;
1720+
}

orchagent/fabricportsorch.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
#include "producertable.h"
1010
#include "flex_counter_manager.h"
1111

12+
using Clock = std::chrono::system_clock;
13+
using TimePoint = std::chrono::time_point<Clock>;
14+
1215
#define STATE_FABRIC_CAPACITY_TABLE_NAME "FABRIC_CAPACITY_TABLE"
1316
#define STATE_PORT_CAPACITY_TABLE_NAME "PORT_CAPACITY_TABLE"
1417

@@ -63,6 +66,8 @@ class FabricPortsOrch : public Orch, public Subject
6366
int m_defaultConfigIsolated = 0;
6467
int m_defaultIsolated = 0;
6568
int m_defaultAutoIsolated = 0;
69+
std::unordered_map<std::string, std::queue<TimePoint>> linkQueues;
70+
std::unordered_map<std::string, std::queue<TimePoint>> dnLkQueues;
6671

6772
int getFabricPortList();
6873
void generatePortStats();
@@ -84,6 +89,13 @@ class FabricPortsOrch : public Orch, public Subject
8489
void doTask(Consumer &consumer);
8590
void doFabricPortTask(Consumer &consumer);
8691
void doTask(swss::SelectableTimer &timer);
92+
93+
bool addErrorTime(
94+
const std::string& link,
95+
TimePoint now);
96+
bool checkDownCnt(
97+
const std::string& link,
98+
TimePoint now);
8799
};
88100

89101
#endif /* SWSS_FABRICPORTSORCH_H */

0 commit comments

Comments
 (0)