Skip to content

Commit 48ae866

Browse files
volodymyrsamotiy authored and lguohan committed
[pfcwd] Update PFC storm detection logic for Mellanox platforms (sonic-net#1586)
Use "PFC duration" counters in microseconds instead of quanta. The SONiC PFCWD logic requires the "PFC duration" value in microseconds, but in SAI it was provided as quanta of time. This required an additional conversion that used the port speed value, and it could cause PFCWD to detect a storm on an operationally down port in case of link flapping. There are now new SAI attributes that provide "PFC duration" in microseconds, so the PFCWD storm detection logic is updated to use these new "PFC duration" counters. This algorithm change helps to avoid false PFC storm detection in case of link flapping because the conversion is no longer needed. Signed-off-by: Volodymyr Samotiy <[email protected]>
1 parent 850001f commit 48ae866

2 files changed

Lines changed: 45 additions & 25 deletions

File tree

orchagent/orchdaemon.cpp

Lines changed: 43 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -333,10 +333,48 @@ bool OrchDaemon::init()
333333
CFG_PFC_WD_TABLE_NAME
334334
};
335335

336-
if ((platform == MLNX_PLATFORM_SUBSTRING)
337-
|| (platform == INVM_PLATFORM_SUBSTRING)
338-
|| (platform == BFN_PLATFORM_SUBSTRING)
339-
|| (platform == NPS_PLATFORM_SUBSTRING))
336+
if (platform == MLNX_PLATFORM_SUBSTRING)
337+
{
338+
339+
static const vector<sai_port_stat_t> portStatIds =
340+
{
341+
SAI_PORT_STAT_PFC_0_RX_PAUSE_DURATION_US,
342+
SAI_PORT_STAT_PFC_1_RX_PAUSE_DURATION_US,
343+
SAI_PORT_STAT_PFC_2_RX_PAUSE_DURATION_US,
344+
SAI_PORT_STAT_PFC_3_RX_PAUSE_DURATION_US,
345+
SAI_PORT_STAT_PFC_4_RX_PAUSE_DURATION_US,
346+
SAI_PORT_STAT_PFC_5_RX_PAUSE_DURATION_US,
347+
SAI_PORT_STAT_PFC_6_RX_PAUSE_DURATION_US,
348+
SAI_PORT_STAT_PFC_7_RX_PAUSE_DURATION_US,
349+
SAI_PORT_STAT_PFC_0_RX_PKTS,
350+
SAI_PORT_STAT_PFC_1_RX_PKTS,
351+
SAI_PORT_STAT_PFC_2_RX_PKTS,
352+
SAI_PORT_STAT_PFC_3_RX_PKTS,
353+
SAI_PORT_STAT_PFC_4_RX_PKTS,
354+
SAI_PORT_STAT_PFC_5_RX_PKTS,
355+
SAI_PORT_STAT_PFC_6_RX_PKTS,
356+
SAI_PORT_STAT_PFC_7_RX_PKTS,
357+
};
358+
359+
static const vector<sai_queue_stat_t> queueStatIds =
360+
{
361+
SAI_QUEUE_STAT_PACKETS,
362+
SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES,
363+
};
364+
365+
static const vector<sai_queue_attr_t> queueAttrIds;
366+
367+
m_orchList.push_back(new PfcWdSwOrch<PfcWdZeroBufferHandler, PfcWdLossyHandler>(
368+
m_configDb,
369+
pfc_wd_tables,
370+
portStatIds,
371+
queueStatIds,
372+
queueAttrIds,
373+
PFC_WD_POLL_MSECS));
374+
}
375+
else if ((platform == INVM_PLATFORM_SUBSTRING)
376+
|| (platform == BFN_PLATFORM_SUBSTRING)
377+
|| (platform == NPS_PLATFORM_SUBSTRING))
340378
{
341379

342380
static const vector<sai_port_stat_t> portStatIds =
@@ -367,9 +405,7 @@ bool OrchDaemon::init()
367405

368406
static const vector<sai_queue_attr_t> queueAttrIds;
369407

370-
if ((platform == MLNX_PLATFORM_SUBSTRING)
371-
|| (platform == INVM_PLATFORM_SUBSTRING)
372-
|| (platform == NPS_PLATFORM_SUBSTRING))
408+
if ((platform == INVM_PLATFORM_SUBSTRING) || (platform == NPS_PLATFORM_SUBSTRING))
373409
{
374410
m_orchList.push_back(new PfcWdSwOrch<PfcWdZeroBufferHandler, PfcWdLossyHandler>(
375411
m_configDb,

orchagent/pfc_detect_mellanox.lua

Lines changed: 2 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -8,26 +8,10 @@ local counters_db = ARGV[1]
88
local counters_table_name = ARGV[2]
99
local poll_time = tonumber(ARGV[3])
1010

11-
local asic_db = "1"
12-
local asic_db_port_table = "ASIC_STATE:SAI_OBJECT_TYPE_PORT"
13-
14-
local quanta_size = 512
15-
1611
local rets = {}
1712

1813
redis.call('SELECT', counters_db)
1914

20-
local function port_speed_get(port_id)
21-
redis.call('SELECT', asic_db)
22-
local port_speed = redis.call('HGET', asic_db_port_table .. ':' .. port_id, 'SAI_PORT_ATTR_SPEED')
23-
redis.call('SELECT', counters_db)
24-
return tonumber(port_speed)
25-
end
26-
27-
local function quantatous(quanta, port_id)
28-
return quanta * quanta_size / port_speed_get(port_id)
29-
end
30-
3115
-- Iterate through each queue
3216
local n = table.getn(KEYS)
3317
for i = n, 1, -1 do
@@ -53,7 +37,7 @@ for i = n, 1, -1 do
5337
local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i])
5438
local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i])
5539
local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS'
56-
local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION'
40+
local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION_US'
5741

5842
-- Get all counters
5943
local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES')
@@ -79,7 +63,7 @@ for i = n, 1, -1 do
7963
packets_last = tonumber(packets_last)
8064
pfc_rx_packets_last = tonumber(pfc_rx_packets_last)
8165
pfc_duration_last = tonumber(pfc_duration_last)
82-
local storm_condition = ((quantatous(pfc_duration, port_id) - quantatous(pfc_duration_last, port_id)) > poll_time * 0.8)
66+
local storm_condition = (pfc_duration - pfc_duration_last) > (poll_time * 0.8)
8367

8468
-- Check actual condition of queue being in PFC storm
8569
if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or

0 commit comments

Comments
 (0)