Skip to content

Commit f76e3f1

Browse files
Add PFC historical statistics estimation to the PFCWD Orch (#3533)
pfcwdorch has been updated to allow pfc historical statistic estimation. The orch allows the feature to be enabled or disabled and the pfc_detect_broadcom.lua script will perform the estimation and update counters_db for ports that tracking is enabled on. This implementation was made in accordance with comments from the community HLD review https://zoom.us/rec/share/jWkZRs51QULsDQvrSlm-5qC4OZ3gkG6RVdB2k_vqwgLcnezsaG1XtX5Aqk6xBYCZ.YMhHymrP4jbpkLbm HLD: sonic-net/SONiC#1903 What I did Edited the pfcwdorch and the pfc_detect_broadcom.lua to allow enable/disable-able PFC statistical history tracking. Why I did it Part of the PFC Historical Statistics Feature: sonic-net/SONiC#1904
1 parent f377682 commit f76e3f1

File tree

4 files changed

+214
-7
lines changed

4 files changed

+214
-7
lines changed

orchagent/pfc_detect_broadcom.lua

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,59 @@ local rets = {}
1212

1313
redis.call('SELECT', counters_db)
1414

15+
local function parse_boolean(str) return str == 'true' end
16+
local function parse_number(str) return tonumber(str) or 0 end
17+
18+
local function updateTimePaused(port_key, prio, time_since_last_poll)
19+
-- Estimate that queue paused for entire poll duration
20+
local total_pause_time_field = 'SAI_PORT_STAT_PFC_' .. prio .. '_RX_PAUSE_DURATION_US'
21+
local recent_pause_time_field = 'EST_PORT_STAT_PFC_' .. prio .. '_RECENT_PAUSE_TIME_US'
22+
23+
local recent_pause_time_us = parse_number(
24+
redis.call('HGET', port_key, recent_pause_time_field)
25+
)
26+
local total_pause_time_us = redis.call('HGET', port_key, total_pause_time_field)
27+
28+
-- Only estimate total time when no SAI support
29+
if not total_pause_time_us then
30+
total_pause_time_field = 'EST_PORT_STAT_PFC_' .. prio .. '_RX_PAUSE_DURATION_US'
31+
total_pause_time_us = parse_number(
32+
redis.call('HGET', port_key, total_pause_time_field)
33+
)
34+
35+
local total_pause_time_us_new = total_pause_time_us + time_since_last_poll
36+
redis.call('HSET', port_key, total_pause_time_field, total_pause_time_us_new)
37+
end
38+
39+
local recent_pause_time_us_new = recent_pause_time_us + time_since_last_poll
40+
redis.call('HSET', port_key, recent_pause_time_field, recent_pause_time_us_new)
41+
end
42+
43+
local function restartRecentTime(port_key, prio, timestamp_last)
44+
local recent_pause_time_field = 'EST_PORT_STAT_PFC_' .. prio .. '_RECENT_PAUSE_TIME_US'
45+
local recent_pause_timestamp_field = 'EST_PORT_STAT_PFC_' .. prio .. '_RECENT_PAUSE_TIMESTAMP'
46+
47+
redis.call('HSET', port_key, recent_pause_timestamp_field, timestamp_last)
48+
redis.call('HSET', port_key, recent_pause_time_field, 0)
49+
end
50+
51+
-- Get the time since the last poll, used to compute total and recent times
52+
local timestamp_field_last = 'PFCWD_POLL_TIMESTAMP_last'
53+
local timestamp_last = redis.call('HGET', 'TIMESTAMP', timestamp_field_last)
54+
local time = redis.call('TIME')
55+
-- convert to microseconds
56+
local timestamp_current = tonumber(time[1]) * 1000000 + tonumber(time[2])
57+
58+
-- save current poll as last poll
59+
local timestamp_string = tostring(timestamp_current)
60+
redis.call('HSET', 'TIMESTAMP', timestamp_field_last, timestamp_string)
61+
62+
local time_since_last_poll = poll_time
63+
-- not first poll
64+
if timestamp_last ~= false then
65+
time_since_last_poll = (timestamp_current - tonumber(timestamp_last))
66+
end
67+
1568
-- Iterate through each queue
1669
local n = table.getn(KEYS)
1770
for i = n, 1, -1 do
@@ -86,6 +139,29 @@ for i = n, 1, -1 do
86139
end
87140
time_left = detection_time
88141
end
142+
143+
-- estimate history
144+
local pfc_stat_history = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'PFC_STAT_HISTORY')
145+
if pfc_stat_history and pfc_stat_history == "enable" then
146+
local port_key = counters_table_name .. ':' .. port_id
147+
local was_paused = parse_boolean(queue_pause_status_last)
148+
local now_paused = parse_boolean(queue_pause_status)
149+
150+
-- Activity has occured
151+
if pfc_rx_packets > pfc_rx_packets_last then
152+
-- fresh recent pause period
153+
if not was_paused then
154+
restartRecentTime(port_key, queue_index, timestamp_last)
155+
end
156+
-- Estimate entire interval paused if there was pfc activity
157+
updateTimePaused(port_key, queue_index, time_since_last_poll)
158+
else
159+
-- queue paused entire interval without activity
160+
if now_paused and was_paused then
161+
updateTimePaused(port_key, queue_index, time_since_last_poll)
162+
end
163+
end
164+
end
89165
end
90166

91167
-- Save values for next run

orchagent/pfcwdorch.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#define PFC_WD_ACTION "action"
1616
#define PFC_WD_DETECTION_TIME "detection_time"
1717
#define PFC_WD_RESTORATION_TIME "restoration_time"
18+
#define PFC_STAT_HISTORY "pfc_stat_history"
1819
#define BIG_RED_SWITCH_FIELD "BIG_RED_SWITCH"
1920
#define PFC_WD_IN_STORM "storm"
2021

@@ -187,6 +188,7 @@ task_process_status PfcWdOrch<DropHandler, ForwardHandler>::createEntry(const st
187188
uint32_t restorationTime = 0;
188189
// According to requirements, drop action is default
189190
PfcWdAction action = PfcWdAction::PFC_WD_ACTION_DROP;
191+
string pfcStatHistory = "disable";
190192
Port port;
191193
if (!gPortsOrch->getPort(key, port))
192194
{
@@ -263,6 +265,9 @@ task_process_status PfcWdOrch<DropHandler, ForwardHandler>::createEntry(const st
263265
}
264266
}
265267
}
268+
else if(field == PFC_STAT_HISTORY){
269+
pfcStatHistory = value;
270+
}
266271
else
267272
{
268273
SWSS_LOG_ERROR(
@@ -297,8 +302,13 @@ task_process_status PfcWdOrch<DropHandler, ForwardHandler>::createEntry(const st
297302
SWSS_LOG_ERROR("%s missing", PFC_WD_DETECTION_TIME);
298303
return task_process_status::task_invalid_entry;
299304
}
305+
if (pfcStatHistory != "enable" && pfcStatHistory != "disable")
306+
{
307+
SWSS_LOG_ERROR("%s is invalid value for %s", pfcStatHistory.c_str(), PFC_STAT_HISTORY);
308+
return task_process_status::task_invalid_entry;
309+
}
300310

301-
if (!startWdOnPort(port, detectionTime, restorationTime, action))
311+
if (!startWdOnPort(port, detectionTime, restorationTime, action, pfcStatHistory))
302312
{
303313
SWSS_LOG_ERROR("Failed to start PFC Watchdog on port %s", port.m_alias.c_str());
304314
return task_process_status::task_need_retry;
@@ -516,7 +526,7 @@ void PfcWdSwOrch<DropHandler, ForwardHandler>::enableBigRedSwitchMode()
516526

517527
template <typename DropHandler, typename ForwardHandler>
518528
bool PfcWdSwOrch<DropHandler, ForwardHandler>::registerInWdDb(const Port& port,
519-
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action)
529+
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action, string pfcStatHistory)
520530
{
521531
SWSS_LOG_ENTER();
522532

@@ -564,6 +574,7 @@ bool PfcWdSwOrch<DropHandler, ForwardHandler>::registerInWdDb(const Port& port,
564574
"" :
565575
to_string(restorationTime * 1000));
566576
countersFieldValues.emplace_back("PFC_WD_ACTION", this->serializeAction(action));
577+
countersFieldValues.emplace_back("PFC_STAT_HISTORY", pfcStatHistory);
567578

568579
this->getCountersTable()->set(queueIdStr, countersFieldValues);
569580

@@ -747,11 +758,11 @@ PfcWdSwOrch<DropHandler, ForwardHandler>::PfcWdQueueEntry::PfcWdQueueEntry(
747758

748759
template <typename DropHandler, typename ForwardHandler>
749760
bool PfcWdSwOrch<DropHandler, ForwardHandler>::startWdOnPort(const Port& port,
750-
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action)
761+
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action, string pfcStatHistory)
751762
{
752763
SWSS_LOG_ENTER();
753764

754-
return registerInWdDb(port, detectionTime, restorationTime, action);
765+
return registerInWdDb(port, detectionTime, restorationTime, action, pfcStatHistory);
755766
}
756767

757768
template <typename DropHandler, typename ForwardHandler>

orchagent/pfcwdorch.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class PfcWdOrch: public Orch
4040

4141
virtual void doTask(Consumer& consumer);
4242
virtual bool startWdOnPort(const Port& port,
43-
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action) = 0;
43+
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action, string pfcStatHistory) = 0;
4444
virtual bool stopWdOnPort(const Port& port) = 0;
4545

4646
shared_ptr<Table> getCountersTable(void)
@@ -89,7 +89,7 @@ class PfcWdSwOrch: public PfcWdOrch<DropHandler, ForwardHandler>
8989

9090
void doTask(Consumer& consumer) override;
9191
virtual bool startWdOnPort(const Port& port,
92-
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action);
92+
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action, string pfcStatHistory);
9393
virtual bool stopWdOnPort(const Port& port);
9494

9595
task_process_status createEntry(const string& key, const vector<FieldValueTuple>& data) override;
@@ -121,7 +121,7 @@ class PfcWdSwOrch: public PfcWdOrch<DropHandler, ForwardHandler>
121121
template <typename T>
122122
static unordered_set<string> counterIdsToStr(const vector<T> ids, string (*convert)(T));
123123
bool registerInWdDb(const Port& port,
124-
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action);
124+
uint32_t detectionTime, uint32_t restorationTime, PfcWdAction action, string pfcStatHistory);
125125
void unregisterFromWdDb(const Port& port);
126126
void doTask(swss::NotificationConsumer &wdNotification);
127127

tests/mock_tests/portsorch_ut.cpp

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3429,6 +3429,126 @@ namespace portsorch_test
34293429
sai_port_api = orig_port_api;
34303430
}
34313431

3432+
/* This test verifies that an invalid configuration
3433+
* of pfc stat history will not enable the featuer
3434+
*/
3435+
TEST_F(PortsOrchTest, PfcInvalidHistoryToggle)
3436+
{
3437+
_hook_sai_switch_api();
3438+
// setup the tables with data
3439+
std::deque<KeyOpFieldsValuesTuple> entries;
3440+
Table portTable = Table(m_app_db.get(), APP_PORT_TABLE_NAME);
3441+
3442+
// Get SAI default ports to populate DB
3443+
auto ports = ut_helper::getInitialSaiPorts();
3444+
3445+
// Populate port table with SAI ports
3446+
for (const auto &it : ports)
3447+
{
3448+
portTable.set(it.first, it.second);
3449+
}
3450+
portTable.set("PortConfigDone", { { "count", to_string(ports.size()) } });
3451+
portTable.set("PortInitDone", { { "lanes", "0" } });
3452+
3453+
// refill consumer
3454+
gPortsOrch->addExistingData(&portTable);
3455+
3456+
// Apply configuration :
3457+
// create ports
3458+
static_cast<Orch *>(gPortsOrch)->doTask();
3459+
// Apply configuration
3460+
// ports
3461+
static_cast<Orch *>(gPortsOrch)->doTask();
3462+
3463+
ASSERT_TRUE(gPortsOrch->allPortsReady());
3464+
3465+
// No more tasks
3466+
vector<string> ts;
3467+
gPortsOrch->dumpPendingTasks(ts);
3468+
ASSERT_TRUE(ts.empty());
3469+
ts.clear();
3470+
3471+
entries.clear();
3472+
entries.push_back({ "Ethernet0", "SET", { { "pfc_enable", "3,4" }, { "pfcwd_sw_enable", "3,4" } } });
3473+
auto portQosMapConsumer = dynamic_cast<Consumer *>(gQosOrch->getExecutor(CFG_PORT_QOS_MAP_TABLE_NAME));
3474+
portQosMapConsumer->addToSync(entries);
3475+
entries.clear();
3476+
static_cast<Orch *>(gQosOrch)->doTask();
3477+
3478+
entries.push_back({ "GLOBAL", "SET", {{ "POLL_INTERVAL", "200" }}});
3479+
entries.push_back({ "Ethernet0", "SET", {
3480+
{ "action", "drop" },
3481+
{ "detection_time", "200" },
3482+
{ "restoration_time", "200" }
3483+
} });
3484+
auto PfcwdConsumer = dynamic_cast<Consumer *>(gPfcwdOrch<PfcWdDlrHandler, PfcWdDlrHandler>->getExecutor(CFG_PFC_WD_TABLE_NAME));
3485+
PfcwdConsumer->addToSync(entries);
3486+
3487+
// trigger the notification
3488+
static_cast<Orch*>(gPfcwdOrch<PfcWdDlrHandler, PfcWdDlrHandler>)->doTask();
3489+
ASSERT_EQ((gPfcwdOrch<PfcWdDlrHandler, PfcWdDlrHandler>->m_pfcwd_ports.size()), 1);
3490+
entries.clear();
3491+
3492+
// create pfcwd entry with an invalid history setting
3493+
entries.push_back({ "Ethernet0", "SET", {
3494+
{ "action", "drop" },
3495+
{ "detection_time", "200" },
3496+
{ "restoration_time", "200" },
3497+
{ "pfc_stat_history", "up" }
3498+
} });
3499+
PfcwdConsumer->addToSync(entries);
3500+
static_cast<Orch*>(gPfcwdOrch<PfcWdDlrHandler, PfcWdDlrHandler>)->doTask();
3501+
ASSERT_EQ((gPfcwdOrch<PfcWdDlrHandler, PfcWdDlrHandler>->m_pfcwd_ports.size()), 1);
3502+
entries.clear();
3503+
3504+
// verify in counters db that history is NOT enabled
3505+
Port port;
3506+
gPortsOrch->getPort("Ethernet0", port);
3507+
auto countersTable = make_shared<Table>(m_counters_db.get(), COUNTERS_TABLE);
3508+
auto entryMap = gPfcwdOrch<PfcWdDlrHandler, PfcWdDlrHandler>->m_entryMap;
3509+
3510+
sai_object_id_t queueId = port.m_queue_ids[3];
3511+
ASSERT_NE(entryMap.find(queueId), entryMap.end());
3512+
3513+
string queueIdStr = sai_serialize_object_id(queueId);
3514+
vector<FieldValueTuple> countersFieldValues;
3515+
countersTable->get(queueIdStr, countersFieldValues);
3516+
ASSERT_NE(countersFieldValues.size(), 0);
3517+
3518+
for (auto &valueTuple : countersFieldValues)
3519+
{
3520+
if (fvField(valueTuple) == "PFC_STAT_HISTORY")
3521+
{
3522+
ASSERT_TRUE(fvValue(valueTuple) == "disable");
3523+
}
3524+
}
3525+
3526+
queueId = port.m_queue_ids[4];
3527+
ASSERT_NE(entryMap.find(queueId), entryMap.end());
3528+
3529+
queueIdStr = sai_serialize_object_id(queueId);
3530+
countersFieldValues.clear();
3531+
countersTable->get(queueIdStr, countersFieldValues);
3532+
ASSERT_NE(countersFieldValues.size(), 0);
3533+
3534+
for (auto &valueTuple : countersFieldValues)
3535+
{
3536+
if (fvField(valueTuple) == "PFC_STAT_HISTORY")
3537+
{
3538+
ASSERT_TRUE(fvValue(valueTuple) == "disable");
3539+
}
3540+
}
3541+
3542+
// remove from monitoring
3543+
entries.push_back({ "Ethernet0", "DEL", { {} } });
3544+
PfcwdConsumer->addToSync(entries);
3545+
entries.clear();
3546+
static_cast<Orch *>(gPfcwdOrch<PfcWdDlrHandler, PfcWdDlrHandler>)->doTask();
3547+
ASSERT_EQ((gPfcwdOrch<PfcWdDlrHandler, PfcWdDlrHandler>->m_pfcwd_ports.size()), 0);
3548+
3549+
_unhook_sai_switch_api();
3550+
}
3551+
34323552
/*
34333553
* The scope of this test is to verify that LAG member is
34343554
* added to a LAG before any other object on LAG is created, like RIF, bridge port in warm mode.

0 commit comments

Comments
 (0)