Skip to content

Commit 13a0cb2

Browse files
authored
Add check_pfc_storm_active() to fast-reboot script (sonic-net#3969)
1 parent f45d896 commit 13a0cb2

3 files changed

Lines changed: 132 additions & 3 deletions

File tree

pfcwd/main.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,15 @@ def collect_stats(self, empty, queues):
163163

164164
self.table += table
165165

166-
def show_stats(self, empty, queues):
166+
def show_stats(self, empty, queues, check_storm=False):
167167
del self.table[:]
168168
self.collect_stats(empty, queues)
169+
170+
if check_storm:
171+
# Check for storms and exit accordingly - no output needed
172+
storms_detected = any(row[1] == 'stormed' for row in self.table if len(row) > 1)
173+
sys.exit(1 if storms_detected else 0)
174+
169175
click.echo(tabulate(
170176
self.table, STATS_HEADER, stralign='right', numalign='right',
171177
tablefmt='simple'
@@ -468,13 +474,14 @@ def show():
468474
@show.command()
469475
@multi_asic_util.multi_asic_click_options
470476
@click.option('-e', '--empty', is_flag=True)
477+
@click.option('--check-storm', is_flag=True, help='Exit 1 if any storms detected, 0 otherwise')
471478
@click.argument('queues', nargs=-1)
472479
@clicommon.pass_db
473-
def stats(db, namespace, display, empty, queues):
480+
def stats(db, namespace, display, empty, check_storm, queues):
474481
""" Show PFC Watchdog stats per queue """
475482
if (len(queues)):
476483
display = constants.DISPLAY_ALL
477-
PfcwdCli(db, namespace, display).show_stats(empty, queues)
484+
PfcwdCli(db, namespace, display).show_stats(empty, queues, check_storm)
478485

479486
# Show config
480487
@show.command()

scripts/fast-reboot

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21
5151
EXIT_PLATFORM_FW_AU_FAILURE=22
5252
EXIT_TEAMD_RETRY_COUNT_FAILURE=23
5353
EXIT_NO_MIRROR_SESSION_ACLS=24
54+
EXIT_PFC_STORM_DETECTED=25
5455
EXIT_LEFTOVER_CPA_TUNNEL=30
5556

5657
function error()
@@ -514,6 +515,18 @@ function check_docker_exec()
514515
done
515516
}
516517
518+
function check_pfc_storm_active()
519+
{
520+
debug "Checking for active PFC storms..."
521+
522+
if pfcwd show stats --check-storm >/dev/null 2>&1; then
523+
debug "No active PFC storms detected. Safe to proceed with warm-reboot..."
524+
else
525+
error "PFC storm detected. Aborting warm-reboot to prevent failure in recovery path..."
526+
exit ${EXIT_PFC_STORM_DETECTED}
527+
fi
528+
}
529+
517530
function check_db_integrity()
518531
{
519532
if [[ "$REBOOT_TYPE" = "warm-reboot" || "$REBOOT_TYPE" = "fastfast-reboot" || "$REBOOT_TYPE" = "express-reboot" || "$REBOOT_TYPE" = "fast-reboot" ]]; then
@@ -533,6 +546,7 @@ function check_db_integrity()
533546
534547
function reboot_pre_check()
535548
{
549+
check_pfc_storm_active
536550
check_docker_exec
537551
# Make sure that the file system is normal: read-write able
538552
filename="/host/test-$(date +%Y%m%d-%H%M%S)"

tests/pfcwd_test.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,114 @@ def test_pfcwd_enable_history_ports_invalid(self):
406406
# same as original config
407407
assert result.output == test_vectors.pfcwd_show_config_output
408408

409+
def test_pfcwd_show_stats_check_storm_no_storms(self):
410+
""" Test --check-storm flag when no storms are present """
411+
import pfcwd.main as pfcwd
412+
from unittest.mock import patch
413+
runner = CliRunner()
414+
db = Db()
415+
416+
# Mock the collect_stats method to simulate no storms
417+
def mock_collect_stats_no_storm(self, empty, queues, storm_only=False):
418+
# Create fake table data with only operational queues
419+
self.table = [
420+
['Ethernet0:3', 'operational', '2/2', '100/100', '100/100', '0/0', '0/0'],
421+
['Ethernet4:3', 'operational', '3/3', '150/150', '150/150', '0/0', '0/0'],
422+
['Ethernet8:4', 'operational', '1/1', '50/50', '50/50', '0/0', '0/0']
423+
]
424+
425+
with patch.object(pfcwd.PfcwdCli, 'collect_stats', mock_collect_stats_no_storm):
426+
# Test with no storms - should exit 0
427+
result = runner.invoke(
428+
pfcwd.cli.commands["show"].commands["stats"],
429+
["--check-storm"],
430+
obj=db
431+
)
432+
print("No storms test - exit code:", result.exit_code)
433+
print("No storms test - output:", result.output)
434+
assert result.exit_code == 0
435+
assert result.output == "" # Should be silent when checking storms
436+
437+
def test_pfcwd_show_stats_check_storm_with_storms(self):
438+
""" Test --check-storm flag when storms are present """
439+
import pfcwd.main as pfcwd
440+
from unittest.mock import patch
441+
runner = CliRunner()
442+
db = Db()
443+
444+
# Mock the collect_stats method to simulate storm detection
445+
def mock_collect_stats_with_storm(self, empty, queues, storm_only=False):
446+
# Create fake table data with a stormed queue
447+
self.table = [
448+
['Ethernet0:3', 'stormed', '1/0', '100/300', '100/300', '0/200', '0/200']
449+
]
450+
451+
with patch.object(pfcwd.PfcwdCli, 'collect_stats', mock_collect_stats_with_storm):
452+
result = runner.invoke(
453+
pfcwd.cli.commands["show"].commands["stats"],
454+
["--check-storm"],
455+
obj=db
456+
)
457+
print("With storms test - exit code:", result.exit_code)
458+
print("With storms test - output:", result.output)
459+
assert result.exit_code == 1
460+
assert result.output == "" # Should be silent when checking storms
461+
462+
def test_pfcwd_show_stats_check_storm_mixed_status(self):
463+
""" Test --check-storm flag with mixed operational and stormed queues """
464+
import pfcwd.main as pfcwd
465+
from unittest.mock import patch
466+
runner = CliRunner()
467+
db = Db()
468+
469+
# Mock the collect_stats method to simulate mixed queue states
470+
def mock_collect_stats_mixed(self, empty, queues, storm_only=False):
471+
# Create fake table data with mixed states - should still exit 1 if ANY storm detected
472+
self.table = [
473+
['Ethernet0:3', 'operational', '2/2', '100/100', '100/100', '0/0', '0/0'],
474+
['Ethernet4:3', 'stormed', '1/0', '100/300', '100/300', '0/200', '0/200'],
475+
['Ethernet8:4', 'operational', '0/0', '50/50', '50/50', '0/0', '0/0']
476+
]
477+
478+
with patch.object(pfcwd.PfcwdCli, 'collect_stats', mock_collect_stats_mixed):
479+
result = runner.invoke(
480+
pfcwd.cli.commands["show"].commands["stats"],
481+
["--check-storm"],
482+
obj=db
483+
)
484+
print("Mixed states test - exit code:", result.exit_code)
485+
print("Mixed states test - output:", result.output)
486+
assert result.exit_code == 1 # Should exit 1 if ANY storms detected
487+
assert result.output == ""
488+
489+
def test_pfcwd_show_stats_normal_output_unchanged(self):
490+
""" Test that normal stats output is unchanged when not using --check-storm """
491+
import pfcwd.main as pfcwd
492+
from unittest.mock import patch
493+
runner = CliRunner()
494+
db = Db()
495+
496+
# Mock the collect_stats method to ensure consistent test output
497+
def mock_collect_stats_normal(self, empty, queues, storm_only=False):
498+
# Create fake table data with mixed states (like normal operation)
499+
self.table = [
500+
['Ethernet0:3', 'operational', '2/2', '100/100', '100/100', '0/0', '0/0'],
501+
['Ethernet4:3', 'operational', '3/3', '150/150', '150/150', '0/0', '0/0']
502+
]
503+
504+
with patch.object(pfcwd.PfcwdCli, 'collect_stats', mock_collect_stats_normal):
505+
# Test normal stats command (without --check-storm) - should work as before
506+
result = runner.invoke(
507+
pfcwd.cli.commands["show"].commands["stats"],
508+
obj=db
509+
)
510+
print("Normal output test - exit code:", result.exit_code)
511+
print("Normal output test - output length:", len(result.output))
512+
assert result.exit_code == 0
513+
# Should have normal tabulated output (not empty)
514+
assert len(result.output) > 0
515+
assert "QUEUE" in result.output # Should contain table headers
516+
409517
@classmethod
410518
def teardown_class(cls):
411519
os.environ["PATH"] = os.pathsep.join(os.environ["PATH"].split(os.pathsep)[:-1])

0 commit comments

Comments
 (0)