-
Notifications
You must be signed in to change notification settings - Fork 692
[PFCWD]: Add alert mode broadcom support #343
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,91 @@ | ||
| -- KEYS - queue IDs | ||
| -- ARGV[1] - counters db index | ||
| -- ARGV[2] - counters table name | ||
| -- ARGV[3] - poll time interval | ||
| -- return queue Ids that satisfy criteria | ||
|
|
||
| local counters_db = ARGV[1] | ||
| local counters_table_name = ARGV[2] | ||
| local poll_time = tonumber(ARGV[3]) | ||
|
|
||
| local rets = {} | ||
|
|
||
| redis.call('SELECT', counters_db) | ||
|
|
||
| -- Iterate through each queue | ||
| local n = table.getn(KEYS) | ||
| for i = n, 1, -1 do | ||
| local counter_keys = redis.call('HKEYS', counters_table_name .. ':' .. KEYS[i]) | ||
| local counter_num = 0 | ||
| local old_counter_num = 0 | ||
| local is_deadlock = false | ||
| local pfc_wd_status = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_STATUS') | ||
| local pfc_wd_action = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_ACTION') | ||
| if pfc_wd_status == 'operational' or pfc_wd_action == 'alert' then | ||
| local detection_time = tonumber(redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME')) | ||
| local time_left = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT') | ||
| if not time_left then | ||
| time_left = detection_time | ||
| else | ||
| time_left = tonumber(time_left) | ||
| end | ||
|
|
||
| local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i]) | ||
| local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i]) | ||
| local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS' | ||
| local pfc_on2off_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_ON2OFF_RX_PKTS' | ||
|
|
||
|
|
||
| -- Get all counters | ||
| local occupancy_bytes = tonumber(redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES')) | ||
| local packets = tonumber(redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS')) | ||
| local pfc_rx_packets = tonumber(redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key)) | ||
| local pfc_on2off = tonumber(redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_on2off_key)) | ||
| local queue_pause_status = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS') | ||
|
|
||
| local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last') | ||
| local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last') | ||
| local pfc_on2off_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last') | ||
| local queue_pause_status_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last') | ||
|
|
||
| -- DEBUG CODE START. Uncomment to enable | ||
| local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM') | ||
| -- DEBUG CODE END. | ||
|
|
||
| -- If this is not a first run, then we have last values available | ||
| if packets_last and pfc_rx_packets_last and pfc_on2off_last and queue_pause_status_last then | ||
| packets_last = tonumber(packets_last) | ||
| pfc_rx_packets_last = tonumber(pfc_rx_packets_last) | ||
| pfc_on2off_last = tonumber(pfc_on2off_last) | ||
|
|
||
| -- Check actual condition of queue being in PFC storm | ||
| if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or | ||
| -- DEBUG CODE START. Uncomment to enable | ||
| (debug_storm == "enabled") or | ||
| -- DEBUG CODE END. | ||
| (occupancy_bytes == 0 and pfc_rx_packets - pfc_rx_packets_last > 0 and pfc_on2off - pfc_on2off_last == 0 and queue_pause_status_last == 'true' and queue_pause_status == 'true') then | ||
| if time_left <= poll_time then | ||
| redis.call('PUBLISH', 'PFC_WD', '["' .. KEYS[i] .. '","storm"]') | ||
| is_deadlock = true | ||
| time_left = detection_time | ||
| else | ||
| time_left = time_left - poll_time | ||
| end | ||
| else | ||
| if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Plugins do not distinguish between actions. The two types of notifications are "storm" and "restore". Orchagent decides what to do based on configuration.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Currently, the detect plugin checks queues in operational status, and the restore plugin checks queues in stormed status. This is not true for the alert mode. In alert mode, the detect plugin will check all the queues regardless of the queue status, and the restore plugin never checks anything. Currently the logic of when/which queue to check is included in the plugin itself. That's the reason why the plugin also needs to know the action type, so it can decide which queue to check for the alert mode versus the normal case. In #342, it still runs restore plugins for stormed queues in alert mode, which is not correct.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. According to the document, the definition of alert mode is the following: So as I understand, alert mode is the same as other actions, except it doesn't apply any configuration, but only logs events. @lguohan can you please take a look? Maybe I'm missing something?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The restore plugin's criterion is that the stormed queue receives no more PFC. This criterion is actually based on the fact that we have taken the recovery action: after the recovery action, no more PFC should be received if the queue has recovered. In alert mode, however, we don't take any action but log the events. In that case, the restore criterion is too strong, as it's OK for the queue to occasionally receive PFC when no recovery action has been taken. The detection plugin's criterion determines whether the queue is stormed when no recovery action has been taken. So in alert mode, we would use the detection plugin to determine whether the queue storm has stopped.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok |
||
| redis.call('PUBLISH', 'PFC_WD', '["' .. KEYS[i] .. '","restore"]') | ||
| end | ||
| time_left = detection_time | ||
| end | ||
| end | ||
|
|
||
| -- Save values for next run | ||
| redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_ATTR_PAUSE_STATUS_last', queue_pause_status) | ||
| redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets) | ||
| redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left) | ||
| redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets) | ||
| redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_on2off_key .. '_last', pfc_on2off) | ||
| end | ||
| end | ||
|
|
||
| return rets | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -108,6 +108,27 @@ PfcWdAction PfcWdOrch<DropHandler, ForwardHandler>::deserializeAction(const stri | |
| return actionMap.at(key); | ||
| } | ||
|
|
||
| template <typename DropHandler, typename ForwardHandler> | ||
| string PfcWdOrch<DropHandler, ForwardHandler>::serializeAction(const PfcWdAction& action) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Plugins don't have to know which action type it is. It's orchagent's responsibility to make such a decision.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As in the comments above, the detect plugin needs to detect stormed queues in alert mode instead of the restore plugin. This logic needs to be added into the Lua script based on the current implementation. |
||
| { | ||
| SWSS_LOG_ENTER(); | ||
|
|
||
| const map<PfcWdAction, string> actionMap = | ||
| { | ||
| { PfcWdAction::PFC_WD_ACTION_FORWARD, "forward" }, | ||
| { PfcWdAction::PFC_WD_ACTION_DROP, "drop" }, | ||
| { PfcWdAction::PFC_WD_ACTION_ALERT, "alert" }, | ||
| }; | ||
|
|
||
| if (actionMap.find(action) == actionMap.end()) | ||
| { | ||
| return "unknown"; | ||
| } | ||
|
|
||
| return actionMap.at(action); | ||
| } | ||
|
|
||
|
|
||
| template <typename DropHandler, typename ForwardHandler> | ||
| void PfcWdOrch<DropHandler, ForwardHandler>::createEntry(const string& key, | ||
| const vector<FieldValueTuple>& data) | ||
|
|
@@ -270,6 +291,8 @@ void PfcWdSwOrch<DropHandler, ForwardHandler>::registerInWdDb(const Port& port, | |
| vector<FieldValueTuple> countersFieldValues; | ||
| countersFieldValues.emplace_back("PFC_WD_DETECTION_TIME", to_string(detectionTime * 1000)); | ||
| countersFieldValues.emplace_back("PFC_WD_RESTORATION_TIME", to_string(restorationTime * 1000)); | ||
| countersFieldValues.emplace_back("PFC_WD_ACTION", this->serializeAction(action)); | ||
|
|
||
| PfcWdOrch<DropHandler, ForwardHandler>::getCountersTable()->set(queueIdStr, countersFieldValues); | ||
|
|
||
| // We register our queues in PFC_WD table so that syncd will know that it must poll them | ||
|
|
@@ -281,6 +304,12 @@ void PfcWdSwOrch<DropHandler, ForwardHandler>::registerInWdDb(const Port& port, | |
| queueFieldValues.emplace_back(PFC_WD_QUEUE_COUNTER_ID_LIST, str); | ||
| } | ||
|
|
||
| if (!c_queueAttrIds.empty()) | ||
| { | ||
| string str = counterIdsToStr(c_queueAttrIds, sai_serialize_queue_attr); | ||
| queueFieldValues.emplace_back(PFC_WD_QUEUE_ATTR_ID_LIST, str); | ||
| } | ||
|
|
||
| // Create internal entry | ||
| m_entryMap.emplace(queueId, PfcWdQueueEntry(action, port.m_port_id, i)); | ||
|
|
||
|
|
@@ -313,18 +342,19 @@ PfcWdSwOrch<DropHandler, ForwardHandler>::PfcWdSwOrch( | |
| DBConnector *db, | ||
| vector<string> &tableNames, | ||
| vector<sai_port_stat_t> portStatIds, | ||
| vector<sai_queue_stat_t> queueStatIds): | ||
| vector<sai_queue_stat_t> queueStatIds, | ||
| vector<sai_queue_attr_t> queueAttrIds): | ||
| PfcWdOrch<DropHandler, | ||
| ForwardHandler>(db, tableNames), | ||
| m_pfcWdDb(new DBConnector(PFC_WD_DB, DBConnector::DEFAULT_UNIXSOCKET, 0)), | ||
| m_pfcWdTable(new ProducerStateTable(m_pfcWdDb.get(), PFC_WD_STATE_TABLE)), | ||
| c_portStatIds(portStatIds), | ||
| c_queueStatIds(queueStatIds) | ||
| c_queueStatIds(queueStatIds), | ||
| c_queueAttrIds(queueAttrIds) | ||
| { | ||
| SWSS_LOG_ENTER(); | ||
|
|
||
| string platform = getenv("platform") ? getenv("platform") : ""; | ||
|
|
||
| if (platform == "") | ||
| { | ||
| SWSS_LOG_ERROR("Platform environment variable is not defined"); | ||
|
|
@@ -333,7 +363,7 @@ PfcWdSwOrch<DropHandler, ForwardHandler>::PfcWdSwOrch( | |
|
|
||
| string detectSha, restoreSha; | ||
| string detectPluginName = "pfc_detect_" + platform + ".lua"; | ||
| string restorePluginName = "pfc_restore_" + platform + ".lua"; | ||
| string restorePluginName = "pfc_restore.lua"; | ||
|
|
||
| try | ||
| { | ||
|
|
@@ -428,9 +458,21 @@ void PfcWdSwOrch<DropHandler, ForwardHandler>::handleWdNotification(swss::Notifi | |
| return; | ||
| } | ||
|
|
||
| SWSS_LOG_NOTICE("Receive notification, %s", event.c_str()); | ||
| if (event == "storm") | ||
| { | ||
| if (entry->second.action == PfcWdAction::PFC_WD_ACTION_DROP) | ||
| if (entry->second.action == PfcWdAction::PFC_WD_ACTION_ALERT) | ||
| { | ||
| if (entry->second.handler == nullptr) | ||
| { | ||
| entry->second.handler = make_shared<PfcWdActionHandler>( | ||
| entry->second.portId, | ||
| entry->first, | ||
| entry->second.index, | ||
| PfcWdOrch<DropHandler, ForwardHandler>::getCountersTable()); | ||
| } | ||
| } | ||
| else if (entry->second.action == PfcWdAction::PFC_WD_ACTION_DROP) | ||
| { | ||
| entry->second.handler = make_shared<DropHandler>( | ||
| entry->second.portId, | ||
|
|
@@ -446,14 +488,6 @@ void PfcWdSwOrch<DropHandler, ForwardHandler>::handleWdNotification(swss::Notifi | |
| entry->second.index, | ||
| PfcWdOrch<DropHandler, ForwardHandler>::getCountersTable()); | ||
| } | ||
| else if (entry->second.action == PfcWdAction::PFC_WD_ACTION_ALERT) | ||
| { | ||
| entry->second.handler = make_shared<PfcWdActionHandler>( | ||
| entry->second.portId, | ||
| entry->first, | ||
| entry->second.index, | ||
| PfcWdOrch<DropHandler, ForwardHandler>::getCountersTable()); | ||
| } | ||
| else | ||
| { | ||
| throw runtime_error("Unknown PFC WD action"); | ||
|
|
@@ -465,7 +499,7 @@ void PfcWdSwOrch<DropHandler, ForwardHandler>::handleWdNotification(swss::Notifi | |
| } | ||
| else | ||
| { | ||
| SWSS_LOG_ERROR("Received unknown event from plugin"); | ||
| SWSS_LOG_ERROR("Received unknown event from plugin, %s", event.c_str()); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -551,3 +585,4 @@ void PfcWdSwOrch<DropHandler, ForwardHandler>::endWatchdogThread(void) | |
|
|
||
| // Trick to keep member functions in a separate file | ||
| template class PfcWdSwOrch<PfcWdZeroBufferHandler, PfcWdLossyHandler>; | ||
| template class PfcWdSwOrch<PfcWdActionHandler, PfcWdActionHandler>; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,9 +43,10 @@ class PfcWdOrch: public Orch | |
| return m_countersDb; | ||
| } | ||
|
|
||
| private: | ||
| static PfcWdAction deserializeAction(const string& key); | ||
| void createEntry(const string& key, const vector<FieldValueTuple>& data); | ||
| static string serializeAction(const PfcWdAction &action); | ||
| private: | ||
| void createEntry(const string& key, const vector<FieldValueTuple>& data); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. four spaces |
||
| void deleteEntry(const string& name); | ||
|
|
||
| shared_ptr<DBConnector> m_countersDb = nullptr; | ||
|
|
@@ -60,7 +61,8 @@ class PfcWdSwOrch: public PfcWdOrch<DropHandler, ForwardHandler> | |
| DBConnector *db, | ||
| vector<string> &tableNames, | ||
| vector<sai_port_stat_t> portStatIds, | ||
| vector<sai_queue_stat_t> queueStatIds); | ||
| vector<sai_queue_stat_t> queueStatIds, | ||
| vector<sai_queue_attr_t> queueAttrIds); | ||
| virtual ~PfcWdSwOrch(void); | ||
|
|
||
| virtual bool startWdOnPort(const Port& port, | ||
|
|
@@ -97,6 +99,7 @@ class PfcWdSwOrch: public PfcWdOrch<DropHandler, ForwardHandler> | |
|
|
||
| const vector<sai_port_stat_t> c_portStatIds; | ||
| const vector<sai_queue_stat_t> c_queueStatIds; | ||
| const vector<sai_queue_attr_t> c_queueAttrIds; | ||
|
|
||
| shared_ptr<DBConnector> m_pfcWdDb = nullptr; | ||
| shared_ptr<ProducerStateTable> m_pfcWdTable = nullptr; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you can reuse forward handler
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently both drop and forward use the base class for simplicity. I agree we might reuse the forward handler. This PR is for alert mode, so I would prefer to update the drop/forward handlers together in a separate PR.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok