Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions tests/pfcwd/files/pfc_detect_mellanox.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
-- KEYS - queue IDs
-- ARGV[1] - counters db index
-- ARGV[2] - counters table name
-- ARGV[3] - poll time interval (milliseconds)
-- return queue Ids that satisfy criteria

local counters_db = ARGV[1]
local counters_table_name = ARGV[2]
local poll_time = tonumber(ARGV[3]) * 1000

local rets = {}

redis.call('SELECT', counters_db)

-- Record the polling time
local timestamp_last = redis.call('HGET', 'TIMESTAMP', 'pfcwd_poll_timestamp_last')
local timestamp_struct = redis.call('TIME')
local timestamp_current = timestamp_struct[1] + timestamp_struct[2] / 1000000
local timestamp_string = tostring(timestamp_current)
redis.call('HSET', 'TIMESTAMP', 'pfcwd_poll_timestamp_last', timestamp_string)
local real_poll_time = poll_time
if timestamp_last ~= false then
real_poll_time = (timestamp_current - tonumber(timestamp_last)) * 1000000
end

-- Iterate through each queue
local n = table.getn(KEYS)
for i = n, 1, -1 do
local counter_keys = redis.call('HKEYS', counters_table_name .. ':' .. KEYS[i])
local counter_num = 0
local old_counter_num = 0
local is_deadlock = false
local pfc_wd_status = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_STATUS')
local pfc_wd_action = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_ACTION')

local big_red_switch_mode = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'BIG_RED_SWITCH_MODE')
if not big_red_switch_mode and (pfc_wd_status == 'operational' or pfc_wd_action == 'alert') then
local detection_time = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME')
if detection_time then
detection_time = tonumber(detection_time)
local time_left = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT')
if not time_left then
time_left = detection_time
else
time_left = tonumber(time_left)
end

local queue_index = redis.call('HGET', 'COUNTERS_QUEUE_INDEX_MAP', KEYS[i])
local port_id = redis.call('HGET', 'COUNTERS_QUEUE_PORT_MAP', KEYS[i])
-- If there is no entry in COUNTERS_QUEUE_INDEX_MAP or COUNTERS_QUEUE_PORT_MAP then
-- it means KEYS[i] queue is inserted into FLEX COUNTER DB but the corresponding
-- maps haven't been updated yet.
if queue_index and port_id then
local pfc_rx_pkt_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PKTS'
local pfc_duration_key = 'SAI_PORT_STAT_PFC_' .. queue_index .. '_RX_PAUSE_DURATION_US'

-- Get all counters
local occupancy_bytes = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_CURR_OCCUPANCY_BYTES')
local packets = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS')
local pfc_rx_packets = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key)
local pfc_duration = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key)

if occupancy_bytes and packets and pfc_rx_packets and pfc_duration then
occupancy_bytes = tonumber(occupancy_bytes)
packets = tonumber(packets)
pfc_rx_packets = tonumber(pfc_rx_packets)
pfc_duration = tonumber(pfc_duration)

local packets_last = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last')
local pfc_rx_packets_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last')
local pfc_duration_last = redis.call('HGET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last')
-- DEBUG CODE START. Uncomment to enable
local debug_storm = redis.call('HGET', counters_table_name .. ':' .. KEYS[i], 'DEBUG_STORM')
-- DEBUG CODE END.

-- If this is not a first run, then we have last values available
if packets_last and pfc_rx_packets_last and pfc_duration_last then
packets_last = tonumber(packets_last)
pfc_rx_packets_last = tonumber(pfc_rx_packets_last)
pfc_duration_last = tonumber(pfc_duration_last)
local storm_condition = (pfc_duration - pfc_duration_last) > (poll_time * 0.8)

-- Check actual condition of queue being in PFC storm
if (occupancy_bytes > 0 and packets - packets_last == 0 and pfc_rx_packets - pfc_rx_packets_last > 0) or
-- DEBUG CODE START. Uncomment to enable
(debug_storm == "enabled") or
-- DEBUG CODE END.
(occupancy_bytes == 0 and packets - packets_last == 0 and storm_condition) then
if time_left <= poll_time then
redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last')
redis.call('HDEL', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last')
local occupancy_string = '"occupancy","' .. tostring(occupancy_bytes) .. '",'
local packets_string = '"packets","' .. tostring(packets) .. '","packets_last","' .. tostring(packets_last) .. '",'
local pfc_rx_packets_string = '"pfc_rx_packets","' .. tostring(pfc_rx_packets) .. '","pfc_rx_packets_last","' .. tostring(pfc_rx_packets_last) .. '",'
local storm_condition_string = '"pfc_duration","' .. tostring(pfc_duration) .. '","pfc_duration_last","' .. tostring(pfc_duration_last) .. '",'
local timestamps = '"timestamp","' .. timestamp_string .. '","timestamp_last","' .. timestamp_last .. '","real_poll_time","' .. real_poll_time .. '"'
redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","storm",' .. occupancy_string .. packets_string .. pfc_rx_packets_string .. storm_condition_string .. timestamps .. ']')
is_deadlock = true
time_left = detection_time
else
time_left = time_left - poll_time
end
else
if pfc_wd_action == 'alert' and pfc_wd_status ~= 'operational' then
redis.call('PUBLISH', 'PFC_WD_ACTION', '["' .. KEYS[i] .. '","restore"]')
end
time_left = detection_time
end
end

-- Save values for next run
redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'SAI_QUEUE_STAT_PACKETS_last', packets)
redis.call('HSET', counters_table_name .. ':' .. KEYS[i], 'PFC_WD_DETECTION_TIME_LEFT', time_left)
if is_deadlock == false then
redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_rx_pkt_key .. '_last', pfc_rx_packets)
redis.call('HSET', counters_table_name .. ':' .. port_id, pfc_duration_key .. '_last', pfc_duration)
end
end
end
end
end
end

return rets
48 changes: 47 additions & 1 deletion tests/pfcwd/test_pfcwd_all_port_storm.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@
from .files.pfcwd_helper import start_wd_on_ports, start_background_traffic # noqa F401
from .files.pfcwd_helper import EXPECT_PFC_WD_DETECT_RE, EXPECT_PFC_WD_RESTORE_RE, fetch_vendor_specific_diagnosis_re
from .files.pfcwd_helper import send_background_traffic
from tests.common import config_reload

TEMPLATES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "templates")
FILE_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "files")

pytestmark = [
pytest.mark.disable_loganalyzer,
Expand All @@ -26,6 +28,50 @@ def pfc_queue_idx():
yield 3 # Hardcoded in the testcase as well.


@pytest.fixture(scope='module')
def degrade_pfcwd_detection(duthosts, enum_rand_one_per_hwsku_frontend_hostname, fanouthosts):
"""
A fixture to degrade PFC Watchdog detection logic.
It's requried because leaf fanout switch can't generate enough PFC pause to trigger
PFC storm on all ports.
"""
duthost = duthosts[enum_rand_one_per_hwsku_frontend_hostname]
dut_asic_type = duthost.facts["asic_type"].lower()
skip_fixture = False
if dut_asic_type != "mellanox":
skip_fixture = True
# The workaround is not applicable for Mellanox leaf-fanout running ONYX or SONiC
# as we can leverage ASIC to generate PFC pause frames
for fanouthost in list(fanouthosts.values()):
fanout_os = fanouthost.get_fanout_os()
if fanout_os == 'onyx' or fanout_os == 'sonic' and fanouthost.facts['asic_type'] == "mellanox":
skip_fixture = True
break
if skip_fixture:
yield
return
logger.info("--- Degrade PFCWD detection logic --")
SRC_FILE = FILE_DIR + "/pfc_detect_mellanox.lua"
DST_FILE = "/usr/share/swss/pfc_detect_mellanox.lua"
# Backup original PFC Watchdog detection script
cmd = "docker exec -i swss cp {} {}.bak".format(DST_FILE, DST_FILE)
duthost.shell(cmd)
# Copy the new script to DUT
duthost.copy(src=SRC_FILE, dest='/tmp')
# Copy the new script to swss container
cmd = "docker cp /tmp/pfc_detect_mellanox.lua swss:{}".format(DST_FILE)
duthost.shell(cmd)
# Reload DUT to apply the new script
config_reload(duthost, safe_reload=True, check_intf_up_ports=True, wait_for_bgp=True)
yield
# Restore the original PFC Watchdog detection script
cmd = "docker exec -i swss cp {}.bak {}".format(DST_FILE, DST_FILE)
duthost.shell(cmd)
config_reload(duthost, safe_reload=True, check_intf_up_ports=True, wait_for_bgp=True)
# Cleanup
duthost.file(path='/tmp/pfc_detect_mellanox.lua', state='absent')


@pytest.fixture(scope='class', autouse=True)
def stop_pfcwd(duthosts, enum_rand_one_per_hwsku_frontend_hostname):
"""
Expand Down Expand Up @@ -120,7 +166,7 @@ def set_storm_params(duthost, fanout_graph, fanouthosts, peer_params):
return storm_hndle


@pytest.mark.usefixtures('stop_pfcwd', 'storm_test_setup_restore', 'start_background_traffic')
@pytest.mark.usefixtures('degrade_pfcwd_detection', 'stop_pfcwd', 'storm_test_setup_restore', 'start_background_traffic') # noqa E501
class TestPfcwdAllPortStorm(object):
""" PFC storm test class """
def run_test(self, duthost, storm_hndle, expect_regex, syslog_marker, action):
Expand Down