Skip to content

Commit c79fd93

Browse files
authored
Add fabric capacity monitoring code. (#3097)
* Add fabric capacity monitoring code. * The HLD can be found at https://github.com/sonic-net/SONiC/blob/master/doc/voq/fabric.md The current design does not cover the card hotswap/OIR cases. We will discuss how to handle that in a separate proposal.
1 parent 7612326 commit c79fd93

4 files changed

Lines changed: 260 additions & 2 deletions

File tree

orchagent/fabricportsorch.cpp

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
#define RECOVERY_POLLS_CFG 8
3636
#define ERROR_RATE_CRC_CELLS_CFG 1
3737
#define ERROR_RATE_RX_CELLS_CFG 61035156
38+
#define FABRIC_LINK_RATE 44316
3839

3940
extern sai_object_id_t gSwitchId;
4041
extern sai_switch_api_t *sai_switch_api;
@@ -76,6 +77,7 @@ FabricPortsOrch::FabricPortsOrch(DBConnector *appl_db, vector<table_name_with_pr
7677

7778
m_state_db = shared_ptr<DBConnector>(new DBConnector("STATE_DB", 0));
7879
m_stateTable = unique_ptr<Table>(new Table(m_state_db.get(), APP_FABRIC_PORT_TABLE_NAME));
80+
m_fabricCapacityTable = unique_ptr<Table>(new Table(m_state_db.get(), STATE_FABRIC_CAPACITY_TABLE_NAME));
7981

8082
m_counter_db = shared_ptr<DBConnector>(new DBConnector("COUNTERS_DB", 0));
8183
m_portNameQueueCounterTable = unique_ptr<Table>(new Table(m_counter_db.get(), COUNTERS_FABRIC_QUEUE_NAME_MAP));
@@ -836,6 +838,185 @@ void FabricPortsOrch::updateFabricDebugCounters()
836838
}
837839
}
838840

841+
void FabricPortsOrch::updateFabricCapacity()
842+
{
843+
// Init value for fabric capacity monitoring
844+
int capacity = 0;
845+
int downCapacity = 0;
846+
string lnkStatus = "down";
847+
string configIsolated = "0";
848+
string isolated = "0";
849+
string autoIsolated = "0";
850+
int operating_links = 0;
851+
int total_links = 0;
852+
int threshold = 100;
853+
std::vector<FieldValueTuple> constValues;
854+
string applKey = FABRIC_MONITOR_DATA;
855+
856+
// Get capacity warning threshold from APPL_DB table FABRIC_MONITOR_DATA
857+
// By default, this threshold is 100 (percentage).
858+
bool cfgVal = m_applMonitorConstTable->get("FABRIC_MONITOR_DATA", constValues);
859+
if(!cfgVal)
860+
{
861+
SWSS_LOG_INFO("%s default values not set", applKey.c_str());
862+
}
863+
else
864+
{
865+
SWSS_LOG_INFO("%s has default values", applKey.c_str());
866+
}
867+
string configVal = "1";
868+
for (auto cv : constValues)
869+
{
870+
configVal = fvValue(cv);
871+
if (fvField(cv) == "monCapacityThreshWarn")
872+
{
873+
threshold = stoi(configVal);
874+
SWSS_LOG_INFO("monCapacityThreshWarn: %s %s", configVal.c_str(), fvField(cv).c_str());
875+
continue;
876+
}
877+
}
878+
879+
// Check fabric capacity.
880+
SWSS_LOG_INFO("FabricPortsOrch::updateFabricCapacity start");
881+
for (auto p : m_fabricLanePortMap)
882+
{
883+
int lane = p.first;
884+
string key = FABRIC_PORT_PREFIX + to_string(lane);
885+
std::vector<FieldValueTuple> values;
886+
string valuePt;
887+
888+
// Get fabric serdes link status from STATE_DB
889+
bool exist = m_stateTable->get(key, values);
890+
if (!exist)
891+
{
892+
SWSS_LOG_INFO("No state infor for port %s", key.c_str());
893+
return;
894+
}
895+
for (auto val : values)
896+
{
897+
valuePt = fvValue(val);
898+
if (fvField(val) == "STATUS")
899+
{
900+
lnkStatus = valuePt;
901+
continue;
902+
}
903+
if (fvField(val) == "CONFIG_ISOLATED")
904+
{
905+
configIsolated = valuePt;
906+
continue;
907+
}
908+
if (fvField(val) == "ISOLATED")
909+
{
910+
isolated = valuePt;
911+
continue;
912+
}
913+
if (fvField(val) == "AUTO_ISOLATED")
914+
{
915+
autoIsolated = valuePt;
916+
continue;
917+
}
918+
}
919+
// Calculate total number of serdes link, number of operational links,
920+
// total fabric capacity.
921+
bool linkIssue = false;
922+
if (configIsolated == "1" || isolated == "1" || autoIsolated == "1")
923+
{
924+
linkIssue = true;
925+
}
926+
927+
if (lnkStatus == "down" || linkIssue == true)
928+
{
929+
downCapacity += FABRIC_LINK_RATE;
930+
}
931+
else
932+
{
933+
capacity += FABRIC_LINK_RATE;
934+
operating_links += 1;
935+
}
936+
total_links += 1;
937+
}
938+
939+
SWSS_LOG_INFO("Capacity: %d Missing %d", capacity, downCapacity);
940+
941+
// Get LAST_EVENT from STATE_DB
942+
943+
// Calculate the current capacity to see if
944+
// it is lower or higher than the threshold
945+
string cur_event = "None";
946+
string event = "None";
947+
int expect_links = total_links * threshold / 100;
948+
if (expect_links > operating_links)
949+
{
950+
cur_event = "Lower";
951+
}
952+
else
953+
{
954+
cur_event = "Higher";
955+
}
956+
957+
SWSS_LOG_NOTICE(" total link %d expected link %d oper link %d event %s", total_links, expect_links, operating_links, cur_event.c_str());
958+
959+
// Update the capacity data in this poll to STATE_DB
960+
SWSS_LOG_INFO("Capacity: %d Missing %d", capacity, downCapacity);
961+
962+
string lastEvent = "None";
963+
string lastTime = "Never";
964+
// Get the last event and time that event happend from STATE_DB
965+
bool capacity_data = m_fabricCapacityTable->get("FABRIC_CAPACITY_DATA", constValues);
966+
if (capacity_data)
967+
{
968+
for (auto cv : constValues)
969+
{
970+
if(fvField(cv) == "last_event")
971+
{
972+
lastEvent = fvValue(cv);
973+
continue;
974+
}
975+
if(fvField(cv) == "last_event_time")
976+
{
977+
lastTime = fvValue(cv);
978+
continue;
979+
}
980+
}
981+
}
982+
983+
auto now = std::chrono::system_clock::now();
984+
auto now_s = std::chrono::time_point_cast<std::chrono::seconds>(now);
985+
auto nse = now_s.time_since_epoch();
986+
987+
// If last event is None or higher, but the capacity is lower in this poll,
988+
// update the STATE_DB with the event (lower) and the time.
989+
// If the last event is lower, and the capacity is back to higher than the threshold,
990+
// update the STATE_DB with the event (higher) and the time.
991+
event = lastEvent;
992+
if (cur_event == "Lower")
993+
{
994+
if (lastEvent == "None" || lastEvent == "Higher")
995+
{
996+
event = "Lower";
997+
lastTime = to_string(nse.count());
998+
}
999+
}
1000+
else if (cur_event == "Higher")
1001+
{
1002+
if (lastEvent == "Lower")
1003+
{
1004+
event = "Higher";
1005+
lastTime = to_string(nse.count());
1006+
}
1007+
}
1008+
1009+
// Update STATE_DB
1010+
SWSS_LOG_INFO("FabricPortsOrch::updateFabricCapacity now update STATE_DB");
1011+
m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "fabric_capacity", to_string(capacity));
1012+
m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "missing_capacity", to_string(downCapacity));
1013+
m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "operating_links", to_string(operating_links));
1014+
m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "number_of_links", to_string(total_links));
1015+
m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "warning_threshold", to_string(threshold));
1016+
m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "last_event", event);
1017+
m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "last_event_time", lastTime);
1018+
}
1019+
8391020
void FabricPortsOrch::doTask()
8401021
{
8411022
}
@@ -1039,6 +1220,7 @@ void FabricPortsOrch::doTask(swss::SelectableTimer &timer)
10391220
if (m_getFabricPortListDone)
10401221
{
10411222
updateFabricDebugCounters();
1223+
updateFabricCapacity();
10421224
}
10431225
}
10441226
}

orchagent/fabricportsorch.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
#include "producertable.h"
1010
#include "flex_counter_manager.h"
1111

12+
#define STATE_FABRIC_CAPACITY_TABLE_NAME "FABRIC_CAPACITY_TABLE"
13+
#define STATE_PORT_CAPACITY_TABLE_NAME "PORT_CAPACITY_TABLE"
14+
1215
class FabricPortsOrch : public Orch, public Subject
1316
{
1417
public:
@@ -31,6 +34,7 @@ class FabricPortsOrch : public Orch, public Subject
3134
unique_ptr<Table> m_portNamePortCounterTable;
3235
unique_ptr<Table> m_fabricCounterTable;
3336
unique_ptr<Table> m_applTable;
37+
unique_ptr<Table> m_fabricCapacityTable;
3438
unique_ptr<Table> m_applMonitorConstTable;
3539
unique_ptr<ProducerTable> m_flexCounterTable;
3640

@@ -61,6 +65,7 @@ class FabricPortsOrch : public Orch, public Subject
6165
void generatePortStats();
6266
void updateFabricPortState();
6367
void updateFabricDebugCounters();
68+
void updateFabricCapacity();
6469

6570
void doTask() override;
6671
void doTask(Consumer &consumer);

tests/test_fabric_capacity.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import random
2+
from dvslib.dvs_database import DVSDatabase
3+
from dvslib.dvs_common import PollingConfig
4+
5+
6+
class TestVirtualChassis(object):
    def test_voq_switch_fabric_capacity(self, vst):
        """Test basic fabric capacity infrastructure in VOQ switches.

        This test validates that when fabric links get isolated, the fabric
        capacity gets updated in STATE_DB, and that when the links get
        unisolated the fabric capacity is restored as well.
        """

        dvss = vst.dvss
        for name in dvss.keys():
            dvs = dvss[name]
            # Get the config information and choose a linecard or fabric card to test.
            config_db = dvs.get_config_db()
            metatbl = config_db.get_entry("DEVICE_METADATA", "localhost")

            cfg_switch_type = metatbl.get("switch_type")
            if cfg_switch_type == "fabric":

                # Get STATE_DB info.
                sdb = dvs.get_state_db()
                # There are 16 fabric ports in the test environment.
                # Choose one link to test.
                portNum = random.randint(1, 16)
                cdb_port = "Fabric"+str(portNum)
                sdb_port = "PORT"+str(portNum)

                max_poll = PollingConfig(polling_interval=60, timeout=600, strict=True)

                # Set up the test environment: mark the port as under test.
                sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST": "TEST"})

                # Get the current fabric capacity.
                capacity = sdb.get_entry("FABRIC_CAPACITY_TABLE", "FABRIC_CAPACITY_DATA")['operating_links']
                if sdb.get_entry("FABRIC_PORT_TABLE", sdb_port)['STATUS'] == 'up':
                    try:
                        # Clean up the testing port:
                        # set TEST_CRC_ERRORS to 0
                        # set TEST_CODE_ERRORS to 0
                        sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST_CRC_ERRORS": "0"})
                        sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST_CODE_ERRORS": "0"})

                        # Isolate the link from CONFIG_DB.
                        config_db.update_entry("FABRIC_PORT", cdb_port, {"isolateStatus": "True"})
                        sdb.wait_for_field_match("FABRIC_PORT_TABLE", sdb_port, {"ISOLATED": "1"}, polling_config=max_poll)
                        # Check that the capacity was reduced.
                        sdb.wait_for_field_negative_match("FABRIC_CAPACITY_TABLE", "FABRIC_CAPACITY_DATA", {'operating_links': capacity}, polling_config=max_poll)
                        # Unisolate the link from CONFIG_DB.
                        config_db.update_entry("FABRIC_PORT", cdb_port, {"isolateStatus": "False"})
                        sdb.wait_for_field_match("FABRIC_PORT_TABLE", sdb_port, {"ISOLATED": "0"}, polling_config=max_poll)
                        # Capacity should recover to its original value.
                        sdb.wait_for_field_match("FABRIC_CAPACITY_TABLE", "FABRIC_CAPACITY_DATA", {'operating_links': capacity}, polling_config=max_poll)
                    finally:
                        # Cleanup: reset injected test state.
                        sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST_CRC_ERRORS": "0"})
                        sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST_CODE_ERRORS": "0"})
                        sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST": "product"})
                else:
                    # Fix: the original referenced an undefined name `port`
                    # here, which raised NameError on a down link.
                    print("The link ", sdb_port, " is down")
            else:
                print("We do not check switch type:", cfg_switch_type)
66+
67+
68+
# Add a dummy always-pass test at the end as a workaround
69+
# for an issue where a Flaky failure on the final test invokes module tear-down before retrying
70+
def test_nonflaky_dummy():
    """Always-passing placeholder so a flaky retry of the last real test
    does not trigger module tear-down before the retry runs."""
    return None
72+

tests/test_fabric_port_isolation.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,17 +29,16 @@ def test_voq_switch_fabric_link(self, vst):
2929
portNum = random.randint(1, 16)
3030
port = "PORT"+str(portNum)
3131
# wait for link monitoring algorithm skips init pollings
32+
sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST": "TEST"})
3233
max_poll = PollingConfig(polling_interval=60, timeout=1200, strict=True)
3334
if sdb.get_entry("FABRIC_PORT_TABLE", port)['STATUS'] == 'up':
34-
sdb.wait_for_field_match("FABRIC_PORT_TABLE", port, {"SKIP_FEC_ERR_ON_LNKUP_CNT": "2"}, polling_config=max_poll)
3535
try:
3636
# clean up the system for the testing port.
3737
# set TEST_CRC_ERRORS to 0
3838
# set TEST_CODE_ERRORS to 0
3939
# set TEST to "TEST"
4040
sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST_CRC_ERRORS":"0"})
4141
sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST_CODE_ERRORS": "0"})
42-
sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST": "TEST"})
4342
# inject testing errors and wait for link get isolated.
4443
sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST_CRC_ERRORS": "2"})
4544
sdb.wait_for_field_match("FABRIC_PORT_TABLE", port, {"AUTO_ISOLATED": "1"}, polling_config=max_poll)

0 commit comments

Comments
 (0)