Skip to content

Commit a1e5327

Browse files
deepak-singhal0408amulyan7
authored andcommitted
TSA-TSB Multi-Asic LC Race Condition: New testcase (sonic-net#17318)
What is the motivation for this PR? Issue details are in sonic-net#17128. This PR adds a test case to catch such issues in the future. How did you do it? Restart the startup_tsa_tsb service multiple times with varying intervals and validate its behavior at each step. How did you verify/test it? Executed the new test case on a multi-ASIC linecard setup.
1 parent d7d0980 commit a1e5327

1 file changed

Lines changed: 85 additions & 0 deletions

File tree

tests/bgp/test_startup_tsa_tsb_service.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22
import datetime
33
import threading
4+
import time
45

56
import pytest
67
from tests.common import reboot, config_reload
@@ -33,6 +34,7 @@
3334

3435
SSH_STATE_ABSENT = "absent"
3536
SSH_STATE_STARTED = "started"
37+
TSA_TSB_SERVICE = "startup_tsa_tsb.service"
3638

3739
lock = threading.Lock()
3840
_cached_frontend_nodes = None
@@ -1665,3 +1667,86 @@ def config_reload_linecard_if_unhealthy(lc):
16651667
reboot_cause = get_reboot_cause(suphost)
16661668
pytest_assert(reboot_cause == COLD_REBOOT_CAUSE,
16671669
"Reboot cause {} did not match the trigger {}".format(reboot_cause, COLD_REBOOT_CAUSE))
1670+
1671+
1672+
@pytest.mark.disable_loganalyzer
def test_tsa_tsb_service_consistency(request, duthosts):
    """
    Restart the startup_tsa_tsb service repeatedly with varying intervals
    and validate its behavior at each step.

    The test runs against the first multi-ASIC linecard returned by
    get_frontend_nodes_per_hwsku(). For every restart interval from 0 to 9
    seconds (looped inside this test, not pytest-parametrized):

    Steps:
        1. Restart the service twice, sleeping for the interval in between.
        2. Verify that the service is running after the last restart.
        3. Ensure the DUT enters TSA (maintenance) state when the service is running.
        4. Execute the TSB command and confirm the service stops as expected.
        5. Verify that the DUT transitions back to the normal state after TSB.

    Teardown:
        Re-run initial_tsa_check_before_and_after_test() and, if the linecard
        is still not in normal state, perform a config reload. The reload is
        attempted even when the teardown check itself fails.

    The test is skipped when a visited linecard lacks TSA persistence support,
    when no multi-ASIC linecard is found, or when get_startup_tsb_timer()
    returns a falsy value for the selected linecard.
    """

    # Pick the first multi-ASIC linecard; bail out as soon as a visited
    # linecard reports that TSA persistence is unsupported.
    masic_linecard = None
    for lc in get_frontend_nodes_per_hwsku(duthosts, request):
        if not check_tsa_persistence_support(lc):
            pytest.skip("TSA persistence not supported in the image")

        if lc.is_multi_asic:
            masic_linecard = lc
            break

    if masic_linecard is None:
        pytest.skip("No multi-ASIC linecard found in the testbed")

    # Only the truthiness of the timer value matters here.
    if not get_startup_tsb_timer(masic_linecard):
        pytest.skip("startup_tsa_tsb.service is not supported on {}".format(masic_linecard.hostname))

    initial_tsa_check_before_and_after_test(duthosts)

    def restart_startup_tsa_tsb_and_verify(lc, interval):
        """Restart the service twice, `interval` seconds apart, then verify the TSA -> TSB cycle."""
        logger.info("Restarting {} startup_tsa_tsb service with interval: {}".format(lc.hostname, interval))
        lc.shell("systemctl restart {}".format(TSA_TSB_SERVICE))
        time.sleep(interval)
        lc.shell('systemctl restart {}'.format(TSA_TSB_SERVICE))

        # In the race-condition / inconsistent-state case the second restart is
        # still applying TSA to recover, and an immediate TSC check would fail
        # the test. Give it 10 seconds to settle before TSC is executed.
        time.sleep(10)

        pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, lc, 'running'),
                      "startup_tsa_tsb service is not running after restart")

        pytest_assert(wait_until(60, 5, 0,
                                 lambda: get_traffic_shift_state(lc, cmd='TSC no-stats') == TS_MAINTENANCE),
                      "DUT is not in maintenance state when startup_tsa_tsb service is running")

        lc.shell("TSB")
        pytest_assert(wait_until(60, 5, 0, get_tsa_tsb_service_status, lc, 'inactive'),
                      "startup_tsa_tsb service did not stop as expected")

        pytest_assert(wait_until(60, 5, 0,
                                 lambda: get_traffic_shift_state(lc, cmd='TSC no-stats') == TS_NORMAL),
                      "DUT did not return to normal state after executing TSB")

    try:
        for interval in range(10):
            restart_startup_tsa_tsb_and_verify(masic_linecard, interval)
            # Pause between iterations before the next pair of restarts.
            time.sleep(10)
    finally:
        try:
            initial_tsa_check_before_and_after_test(duthosts)
        finally:
            # Recover the linecard even if the check above raised, so a failed
            # run does not leave the DUT out of normal state; any exception
            # from the check still propagates once recovery is done.
            if get_traffic_shift_state(masic_linecard) != TS_NORMAL:
                logger.info("DUT's current expected state: {}".format(TS_NORMAL))
                logger.info("DUT is not in normal state, doing config-reload")
                config_reload(masic_linecard, safe_reload=True, check_intf_up_ports=True, exec_tsb=True)

0 commit comments

Comments
 (0)