diff --git a/orchagent/fabricportsorch.cpp b/orchagent/fabricportsorch.cpp index b46fcede092..d61f5d13f20 100644 --- a/orchagent/fabricportsorch.cpp +++ b/orchagent/fabricportsorch.cpp @@ -35,6 +35,7 @@ #define RECOVERY_POLLS_CFG 8 #define ERROR_RATE_CRC_CELLS_CFG 1 #define ERROR_RATE_RX_CELLS_CFG 61035156 +#define FABRIC_LINK_RATE 44316 extern sai_object_id_t gSwitchId; extern sai_switch_api_t *sai_switch_api; @@ -76,6 +77,7 @@ FabricPortsOrch::FabricPortsOrch(DBConnector *appl_db, vector(new DBConnector("STATE_DB", 0)); m_stateTable = unique_ptr(new Table(m_state_db.get(), APP_FABRIC_PORT_TABLE_NAME)); + m_fabricCapacityTable = unique_ptr
(new Table(m_state_db.get(), STATE_FABRIC_CAPACITY_TABLE_NAME)); m_counter_db = shared_ptr(new DBConnector("COUNTERS_DB", 0)); m_portNameQueueCounterTable = unique_ptr
(new Table(m_counter_db.get(), COUNTERS_FABRIC_QUEUE_NAME_MAP)); @@ -836,6 +838,185 @@ void FabricPortsOrch::updateFabricDebugCounters() } } +void FabricPortsOrch::updateFabricCapacity() +{ + // Init value for fabric capacity monitoring + int capacity = 0; + int downCapacity = 0; + string lnkStatus = "down"; + string configIsolated = "0"; + string isolated = "0"; + string autoIsolated = "0"; + int operating_links = 0; + int total_links = 0; + int threshold = 100; + std::vector constValues; + string applKey = FABRIC_MONITOR_DATA; + + // Get capacity warning threshold from APPL_DB table FABRIC_MONITOR_DATA + // By default, this threshold is 100 (percentage). + bool cfgVal = m_applMonitorConstTable->get("FABRIC_MONITOR_DATA", constValues); + if(!cfgVal) + { + SWSS_LOG_INFO("%s default values not set", applKey.c_str()); + } + else + { + SWSS_LOG_INFO("%s has default values", applKey.c_str()); + } + string configVal = "1"; + for (auto cv : constValues) + { + configVal = fvValue(cv); + if (fvField(cv) == "monCapacityThreshWarn") + { + threshold = stoi(configVal); + SWSS_LOG_INFO("monCapacityThreshWarn: %s %s", configVal.c_str(), fvField(cv).c_str()); + continue; + } + } + + // Check fabric capacity. + SWSS_LOG_INFO("FabricPortsOrch::updateFabricCapacity start"); + for (auto p : m_fabricLanePortMap) + { + int lane = p.first; + string key = FABRIC_PORT_PREFIX + to_string(lane); + std::vector values; + string valuePt; + + // Get fabric serdes link status from STATE_DB + bool exist = m_stateTable->get(key, values); + if (!exist) + { + SWSS_LOG_INFO("No state infor for port %s", key.c_str()); + return; + } + for (auto val : values) + { + valuePt = fvValue(val); + if (fvField(val) == "STATUS") + { + lnkStatus = valuePt; + continue; + } + if (fvField(val) == "CONFIG_ISOLATED") + { + configIsolated = valuePt; + continue; + } + if (fvField(val) == "ISOLATED") + { + isolated = valuePt; + continue; + } + if (fvField(val) == "AUTO_ISOLATED") + { + autoIsolated = valuePt; + continue; + } + } + // Calculate total number of serdes link, number of operational links, + // total fabric capacity. + bool linkIssue = false; + if (configIsolated == "1" || isolated == "1" || autoIsolated == "1") + { + linkIssue = true; + } + + if (lnkStatus == "down" || linkIssue == true) + { + downCapacity += FABRIC_LINK_RATE; + } + else + { + capacity += FABRIC_LINK_RATE; + operating_links += 1; + } + total_links += 1; + } + + SWSS_LOG_INFO("Capacity: %d Missing %d", capacity, downCapacity); + + // Get LAST_EVENT from STATE_DB + + // Calculate the current capacity to see if + // it is lower or higher than the threshold + string cur_event = "None"; + string event = "None"; + int expect_links = total_links * threshold / 100; + if (expect_links > operating_links) + { + cur_event = "Lower"; + } + else + { + cur_event = "Higher"; + } + + SWSS_LOG_NOTICE(" total link %d expected link %d oper link %d event %s", total_links, expect_links, operating_links, cur_event.c_str()); + + // Update the capacity data in this poll to STATE_DB + SWSS_LOG_INFO("Capacity: %d Missing %d", capacity, downCapacity); + + string lastEvent = "None"; + string lastTime = "Never"; + // Get the last event and time that event happend from STATE_DB + bool capacity_data = m_fabricCapacityTable->get("FABRIC_CAPACITY_DATA", constValues); + if (capacity_data) + { + for (auto cv : constValues) + { + if(fvField(cv) == "last_event") + { + lastEvent = fvValue(cv); + continue; + } + if(fvField(cv) == "last_event_time") + { + lastTime = fvValue(cv); + continue; + } + } + } + + auto now = std::chrono::system_clock::now(); + auto now_s = std::chrono::time_point_cast(now); + auto nse = now_s.time_since_epoch(); + + // If last event is None or higher, but the capacity is lower in this poll, + // update the STATE_DB with the event (lower) and the time. + // If the last event is lower, and the capacity is back to higher than the threshold, + // update the STATE_DB with the event (higher) and the time. + event = lastEvent; + if (cur_event == "Lower") + { + if (lastEvent == "None" || lastEvent == "Higher") + { + event = "Lower"; + lastTime = to_string(nse.count()); + } + } + else if (cur_event == "Higher") + { + if (lastEvent == "Lower") + { + event = "Higher"; + lastTime = to_string(nse.count()); + } + } + + // Update STATE_DB + SWSS_LOG_INFO("FabricPortsOrch::updateFabricCapacity now update STATE_DB"); + m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "fabric_capacity", to_string(capacity)); + m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "missing_capacity", to_string(downCapacity)); + m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "operating_links", to_string(operating_links)); + m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "number_of_links", to_string(total_links)); + m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "warning_threshold", to_string(threshold)); + m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "last_event", event); + m_fabricCapacityTable->hset("FABRIC_CAPACITY_DATA", "last_event_time", lastTime); +} + void FabricPortsOrch::doTask() { } @@ -1039,6 +1220,7 @@ void FabricPortsOrch::doTask(swss::SelectableTimer &timer) if (m_getFabricPortListDone) { updateFabricDebugCounters(); + updateFabricCapacity(); } } } diff --git a/orchagent/fabricportsorch.h b/orchagent/fabricportsorch.h index 0d637dec43b..e72ae56e3c8 100644 --- a/orchagent/fabricportsorch.h +++ b/orchagent/fabricportsorch.h @@ -9,6 +9,9 @@ #include "producertable.h" #include "flex_counter_manager.h" +#define STATE_FABRIC_CAPACITY_TABLE_NAME "FABRIC_CAPACITY_TABLE" +#define STATE_PORT_CAPACITY_TABLE_NAME "PORT_CAPACITY_TABLE" + class FabricPortsOrch : public Orch, public Subject { public: @@ -31,6 +34,7 @@ class FabricPortsOrch : public Orch, public Subject unique_ptr
m_portNamePortCounterTable; unique_ptr
m_fabricCounterTable; unique_ptr
m_applTable; + unique_ptr
m_fabricCapacityTable; unique_ptr
m_applMonitorConstTable; unique_ptr m_flexCounterTable; @@ -61,6 +65,7 @@ class FabricPortsOrch : public Orch, public Subject void generatePortStats(); void updateFabricPortState(); void updateFabricDebugCounters(); + void updateFabricCapacity(); void doTask() override; void doTask(Consumer &consumer); diff --git a/tests/test_fabric_capacity.py b/tests/test_fabric_capacity.py new file mode 100644 index 00000000000..91bb1b5e944 --- /dev/null +++ b/tests/test_fabric_capacity.py @@ -0,0 +1,72 @@ +import random +from dvslib.dvs_database import DVSDatabase +from dvslib.dvs_common import PollingConfig + + +class TestVirtualChassis(object): + def test_voq_switch_fabric_capacity(self, vst): + """Test basic fabric capacity infrastructure in VOQ switchs. + + This test validates that when fabric links get isolated, the fabric capacity + get updated in the state_db. + When the link get unisolated, the fabric capacity get set back as well. + """ + + dvss = vst.dvss + for name in dvss.keys(): + dvs = dvss[name] + # Get the config information and choose a linecard or fabric card to test. + config_db = dvs.get_config_db() + metatbl = config_db.get_entry("DEVICE_METADATA", "localhost") + + cfg_switch_type = metatbl.get("switch_type") + if cfg_switch_type == "fabric": + + # get state_db infor + sdb = dvs.get_state_db() + # There are 16 fabric ports in the test environment. + # Choose one link to test. + portNum = random.randint(1, 16) + cdb_port = "Fabric"+str(portNum) + sdb_port = "PORT"+str(portNum) + + max_poll = PollingConfig(polling_interval=60, timeout=600, strict=True) + + # setup test environment + sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST": "TEST"}) + + # get current fabric capacity + capacity = sdb.get_entry("FABRIC_CAPACITY_TABLE", "FABRIC_CAPACITY_DATA")['operating_links'] + if sdb.get_entry("FABRIC_PORT_TABLE", sdb_port)['STATUS'] == 'up': + try: + # clean up the testing port. + # set TEST_CRC_ERRORS to 0 + # set TEST_CODE_ERRORS to 0 + sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST_CRC_ERRORS":"0"}) + sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST_CODE_ERRORS": "0"}) + + # isolate the link from config_db + config_db.update_entry("FABRIC_PORT", cdb_port, {"isolateStatus": "True"}) + sdb.wait_for_field_match("FABRIC_PORT_TABLE", sdb_port, {"ISOLATED": "1"}, polling_config=max_poll) + # check if capacity reduced + sdb.wait_for_field_negative_match("FABRIC_CAPACITY_TABLE", "FABRIC_CAPACITY_DATA", {'operating_links': capacity}, polling_config=max_poll) + # unisolate the link from config_db + config_db.update_entry("FABRIC_PORT", cdb_port, {"isolateStatus": "False"}) + sdb.wait_for_field_match("FABRIC_PORT_TABLE", sdb_port, {"ISOLATED": "0"}, polling_config=max_poll) + sdb.wait_for_field_match("FABRIC_CAPACITY_TABLE", "FABRIC_CAPACITY_DATA", {'operating_links': capacity}, polling_config=max_poll) + finally: + # cleanup + sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST_CRC_ERRORS": "0"}) + sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST_CODE_ERRORS": "0"}) + sdb.update_entry("FABRIC_PORT_TABLE", sdb_port, {"TEST": "product"}) + else: + print("The link ", port, " is down") + else: + print("We do not check switch type:", cfg_switch_type) + + +# Add Dummy always-pass test at end as workaroud +# for issue when Flaky fail on final test it invokes module tear-down before retrying +def test_nonflaky_dummy(): + pass + diff --git a/tests/test_fabric_port_isolation.py b/tests/test_fabric_port_isolation.py index d92cb73fe15..d1b57a019fe 100644 --- a/tests/test_fabric_port_isolation.py +++ b/tests/test_fabric_port_isolation.py @@ -29,9 +29,9 @@ def test_voq_switch_fabric_link(self, vst): portNum = random.randint(1, 16) port = "PORT"+str(portNum) # wait for link monitoring algorithm skips init pollings + sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST": "TEST"}) max_poll = PollingConfig(polling_interval=60, timeout=1200, strict=True) if sdb.get_entry("FABRIC_PORT_TABLE", port)['STATUS'] == 'up': - sdb.wait_for_field_match("FABRIC_PORT_TABLE", port, {"SKIP_FEC_ERR_ON_LNKUP_CNT": "2"}, polling_config=max_poll) try: # clean up the system for the testing port. # set TEST_CRC_ERRORS to 0 @@ -39,7 +39,6 @@ def test_voq_switch_fabric_link(self, vst): # set TEST to "TEST" sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST_CRC_ERRORS":"0"}) sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST_CODE_ERRORS": "0"}) - sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST": "TEST"}) # inject testing errors and wait for link get isolated. sdb.update_entry("FABRIC_PORT_TABLE", port, {"TEST_CRC_ERRORS": "2"}) sdb.wait_for_field_match("FABRIC_PORT_TABLE", port, {"AUTO_ISOLATED": "1"}, polling_config=max_poll)