Skip to content

Commit 56be8ad

Browse files
bingwang-ms authored and vikshaw-Nokia committed
Degrade PFC storm detection in PFCWD all ports storm test (sonic-net#13976)
* Degrade PFC storm detection logic in pfcwd_all_port_storm test
1 parent a87b33a commit 56be8ad

2 files changed

Lines changed: 171 additions & 1 deletion

File tree

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
-- PFC watchdog storm detection script, run inside Redis (EVAL).
-- This variant uses a relaxed pause-duration threshold (80% of the poll
-- interval) for declaring a storm on an empty queue.
--
-- KEYS    - queue IDs
-- ARGV[1] - counters db index
-- ARGV[2] - counters table name
-- ARGV[3] - poll time interval (milliseconds)
--
-- Results are delivered by PUBLISHing "storm" / "restore" messages on the
-- PFC_WD_ACTION channel. The returned table `rets` is never populated by this
-- script, so the return value is always an empty list.

local counters_db = ARGV[1]
local counters_table_name = ARGV[2]
-- Scaled by 1000 so it can be compared against the *_RX_PAUSE_DURATION_US
-- counter deltas (microseconds, given ARGV[3] is in milliseconds).
local poll_time = tonumber(ARGV[3]) * 1000

local rets = {}

redis.call('SELECT', counters_db)

-- Record the polling time
-- NOTE: redis.call returns `false` (not nil) when a hash field is missing.
local timestamp_last = redis.call('HGET', 'TIMESTAMP', 'pfcwd_poll_timestamp_last')
local timestamp_struct = redis.call('TIME')
local timestamp_current = tonumber(timestamp_struct[1]) + tonumber(timestamp_struct[2]) / 1000000
local timestamp_string = tostring(timestamp_current)
redis.call('HSET', 'TIMESTAMP', 'pfcwd_poll_timestamp_last', timestamp_string)

-- Measured time since the previous run, in microseconds. It is only reported
-- in the published storm message; detection itself uses the nominal poll_time.
local real_poll_time = poll_time
if timestamp_last ~= false then
    real_poll_time = (timestamp_current - tonumber(timestamp_last)) * 1000000
end

-- Evaluate a single queue: read its current and previous counters, publish a
-- "storm" or "restore" event when warranted, and save the values needed by
-- the next run.
local function check_queue(queue_id)
    local queue_key = counters_table_name .. ':' .. queue_id

    local pfc_wd_status = redis.call('HGET', queue_key, 'PFC_WD_STATUS')
    local pfc_wd_action = redis.call('HGET', queue_key, 'PFC_WD_ACTION')
    local big_red_switch_mode = redis.call('HGET', queue_key, 'BIG_RED_SWITCH_MODE')

    -- Only queues that are operational (or configured with the alert action)
    -- and not in big-red-switch mode are examined.
    if big_red_switch_mode then
        return
    end
    if not (pfc_wd_status == 'operational' or pfc_wd_action == 'alert') then
        return
    end

    local detection_time = redis.call('HGET', queue_key, 'PFC_WD_DETECTION_TIME')
    if not detection_time then
        return
    end
    detection_time = tonumber(detection_time)

    -- Remaining detection budget; starts from the full detection time when
    -- no value has been stored yet.
    local time_left = redis.call('HGET', queue_key, 'PFC_WD_DETECTION_TIME_LEFT')
    if not time_left then
        time_left = detection_time
    else
        time_left = tonumber(time_left)
    end

    local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', queue_id)
    local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', queue_id)
    -- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then
    -- it means the queue is inserted into FLEX COUNTER DB but the corresponding
    -- maps haven't been updated yet.
    if not (queue_index and port_id) then
        return
    end

    local port_key = counters_table_name .. ':' .. port_id
    local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS'
    local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION_US'

    -- Get all counters
    local occupancy_bytes = redis.call('HGET', queue_key, 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES')
    local packets = redis.call('HGET', queue_key, 'SAI_QUEUE_STAT_PACKETS')
    local pfc_rx_packets = redis.call('HGET', port_key, pfc_rx_pkt_key)
    local pfc_duration = redis.call('HGET', port_key, pfc_duration_key)
    if not (occupancy_bytes and packets and pfc_rx_packets and pfc_duration) then
        return
    end
    occupancy_bytes = tonumber(occupancy_bytes)
    packets = tonumber(packets)
    pfc_rx_packets = tonumber(pfc_rx_packets)
    pfc_duration = tonumber(pfc_duration)

    local packets_last = redis.call('HGET', queue_key, 'SAI_QUEUE_STAT_PACKETS_last')
    local pfc_rx_packets_last = redis.call('HGET', port_key, pfc_rx_pkt_key .. '_last')
    local pfc_duration_last = redis.call('HGET', port_key, pfc_duration_key .. '_last')
    -- Debug hook (active): setting DEBUG_STORM to "enabled" on the queue
    -- forces the storm branch regardless of the counters.
    local debug_storm = redis.call('HGET', queue_key, 'DEBUG_STORM')

    local is_deadlock = false

    -- If this is not a first run, then we have last values available
    if packets_last and pfc_rx_packets_last and pfc_duration_last then
        packets_last = tonumber(packets_last)
        pfc_rx_packets_last = tonumber(pfc_rx_packets_last)
        pfc_duration_last = tonumber(pfc_duration_last)

        local no_tx_progress = (packets - packets_last == 0)
        -- Degraded threshold: paused for more than 80% of the poll interval.
        local storm_condition = (pfc_duration - pfc_duration_last) > (poll_time * 0.8)

        -- Check actual condition of queue being in PFC storm
        local in_storm =
            (occupancy_bytes > 0 and no_tx_progress and pfc_rx_packets - pfc_rx_packets_last > 0) or
            (debug_storm == "enabled") or
            (occupancy_bytes == 0 and no_tx_progress and storm_condition)

        if in_storm then
            if time_left <= poll_time then
                -- Detection budget exhausted: report the storm and drop the
                -- saved port counters (they are not re-saved below).
                redis.call('HDEL', port_key, pfc_rx_pkt_key .. '_last')
                redis.call('HDEL', port_key, pfc_duration_key .. '_last')
                local occupancy_string = '"occupancy","' .. tostring(occupancy_bytes) .. '",'
                local packets_string = '"packets","' .. tostring(packets) ..
                    '","packets_last","' .. tostring(packets_last) .. '",'
                local pfc_rx_packets_string = '"pfc_rx_packets","' .. tostring(pfc_rx_packets) ..
                    '","pfc_rx_packets_last","' .. tostring(pfc_rx_packets_last) .. '",'
                local storm_condition_string = '"pfc_duration","' .. tostring(pfc_duration) ..
                    '","pfc_duration_last","' .. tostring(pfc_duration_last) .. '",'
                -- tostring() guards against timestamp_last being `false`
                -- (TIMESTAMP field missing); concatenating a boolean would
                -- raise and abort the whole script.
                local timestamps = '"timestamp","' .. timestamp_string ..
                    '","timestamp_last","' .. tostring(timestamp_last) ..
                    '","real_poll_time","' .. tostring(real_poll_time) .. '"'
                redis.call('PUBLISH', 'PFC_WD_ACTION',
                    '["' .. queue_id .. '","storm",' .. occupancy_string .. packets_string ..
                    pfc_rx_packets_string .. storm_condition_string .. timestamps .. ']')
                is_deadlock = true
                time_left = detection_time
            else
                time_left = time_left - poll_time
            end
        else
            if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then
                redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. queue_id .. '","restore"]')
            end
            time_left = detection_time
        end
    end

    -- Save values for next run
    redis.call('HSET', queue_key, 'SAI_QUEUE_STAT_PACKETS_last', packets)
    redis.call('HSET', queue_key, 'PFC_WD_DETECTION_TIME_LEFT', time_left)
    if not is_deadlock then
        redis.call('HSET', port_key, pfc_rx_pkt_key .. '_last', pfc_rx_packets)
        redis.call('HSET', port_key, pfc_duration_key .. '_last', pfc_duration)
    end
end

-- Iterate through each queue (last to first, as in the original ordering)
for i = #KEYS, 1, -1 do
    check_queue(KEYS[i])
end

return rets

tests/pfcwd/test_pfcwd_all_port_storm.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,10 @@
99
from .files.pfcwd_helper import start_wd_on_ports, start_background_traffic # noqa F401
1010
from .files.pfcwd_helper import EXPECT_PFC_WD_DETECT_RE, EXPECT_PFC_WD_RESTORE_RE, fetch_vendor_specific_diagnosis_re
1111
from .files.pfcwd_helper import send_background_traffic
12+
from tests.common import config_reload
1213

1314
TEMPLATES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
15+
FILE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "files")
1416

1517
pytestmark = [
1618
pytest.mark.disable_loganalyzer,
@@ -26,6 +28,50 @@ def pfc_queue_idx():
2628
yield 3 # Hardcoded in the testcase as well.
2729

2830

31+
@pytest.fixture(scope='module')
def degrade_pfcwd_detection(duthosts, enum_rand_one_per_hwsku_frontend_hostname, fanouthosts):
    """
    A fixture to degrade PFC Watchdog detection logic.
    It's required because leaf fanout switch can't generate enough PFC pause to trigger
    PFC storm on all ports.

    Setup: back up the detection script inside the swss container, replace it with
    the degraded copy from FILE_DIR and reload the DUT configuration.
    Teardown: restore the backup, reload again and remove the temporary copy on the DUT.

    The fixture is a no-op when the DUT ASIC is not Mellanox, or when any fanout runs
    ONYX, or SONiC on a Mellanox ASIC.
    """
    duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
    dut_asic_type = duthost.facts["asic_type"].lower()

    # The replacement targets pfc_detect_mellanox.lua, so other ASICs are skipped
    # without probing the fanouts at all.
    skip_fixture = dut_asic_type != "mellanox"
    if not skip_fixture:
        # The workaround is not applicable for Mellanox leaf-fanout running ONYX or SONiC
        # as we can leverage ASIC to generate PFC pause frames
        for fanouthost in fanouthosts.values():
            fanout_os = fanouthost.get_fanout_os()
            if fanout_os == 'onyx' or (fanout_os == 'sonic' and fanouthost.facts['asic_type'] == "mellanox"):
                skip_fixture = True
                break
    if skip_fixture:
        yield
        return

    logger.info("--- Degrade PFCWD detection logic --")
    script_name = "pfc_detect_mellanox.lua"
    src_file = os.path.join(FILE_DIR, script_name)
    tmp_file = "/tmp/" + script_name
    dst_file = "/usr/share/swss/" + script_name
    # Backup original PFC Watchdog detection script
    duthost.shell("docker exec -i swss cp {} {}.bak".format(dst_file, dst_file))
    # Copy the new script to DUT
    duthost.copy(src=src_file, dest='/tmp')
    # Copy the new script to swss container
    duthost.shell("docker cp {} swss:{}".format(tmp_file, dst_file))
    # Reload DUT to apply the new script
    config_reload(duthost, safe_reload=True, check_intf_up_ports=True, wait_for_bgp=True)

    yield

    try:
        # Restore the original PFC Watchdog detection script
        duthost.shell("docker exec -i swss cp {}.bak {}".format(dst_file, dst_file))
        config_reload(duthost, safe_reload=True, check_intf_up_ports=True, wait_for_bgp=True)
    finally:
        # Cleanup the temporary copy even if the restore or reload above fails
        duthost.file(path=tmp_file, state='absent')
73+
74+
2975
@pytest.fixture(scope='class', autouse=True)
3076
def stop_pfcwd(duthosts, enum_rand_one_per_hwsku_frontend_hostname):
3177
"""
@@ -120,7 +166,7 @@ def set_storm_params(duthost, fanout_graph, fanouthosts, peer_params):
120166
return storm_hndle
121167

122168

123-
@pytest.mark.usefixtures('stop_pfcwd', 'storm_test_setup_restore', 'start_background_traffic')
169+
@pytest.mark.usefixtures('degrade_pfcwd_detection', 'stop_pfcwd', 'storm_test_setup_restore', 'start_background_traffic') # noqa E501
124170
class TestPfcwdAllPortStorm(object):
125171
""" PFC storm test class """
126172
def run_test(self, duthost, storm_hndle, expect_regex, syslog_marker, action):

0 commit comments

Comments
 (0)