Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ansible/config_sonic_basedon_testbed.yml
Original file line number Diff line number Diff line change
Expand Up @@ -738,7 +738,7 @@
macsec_profile: "{{ macsec_profile }}"
num_asics: "{{ num_asics }}"
become: true
when: "('t2' in topo) and (enable_macsec is defined)"
when: "('t2' in topo) and (macsec_profile is defined)"

- name: Use minigraph case
block:
Expand Down
4 changes: 1 addition & 3 deletions ansible/templates/golden_config_db_t2.j2
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
{% endif %}
{%- endfor -%}
{% else %}
{
"MACSEC_PROFILE": {
"MACSEC_PROFILE": {
"{{macsec_profile}}": {
"priority": "{{priority}}",
"cipher_suite": "{{cipher_suite}}",
Expand All @@ -34,6 +33,5 @@
"send_sci": "{{send_sci}}"
}
}
},
{%- endif -%}
}
58 changes: 58 additions & 0 deletions tests/common/helpers/dut_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,64 @@ def clear_failed_flag_and_restart(duthost, container_name):
pytest_assert(restarted, "Failed to restart container '{}' after reset-failed was cleared".format(container_name))


def restart_service_with_startlimit_guard(duthost, service_name, backoff_seconds=30, verify_timeout=180):
"""
Restart a systemd-managed service with StartLimitHit guard.

Strategy:
0) Pre-detect StartLimitHit and, if present, skip a failing restart
1) When not rate-limited, reset-failed to clear stale counters and try restart
2) If restart fails, rate-limit is detected, or container isn't running:
- 'systemctl reset-failed <service>.service'
- fixed backoff (default 30s when rate-limited, 1s otherwise)
- 'systemctl start <service>.service'
- wait until container is running

Returns: True when the service is (re)started and running; asserts on failure.
"""

# 0) Pre-detect StartLimitHit so we can optionally skip a failing restart
pre_rate_limited = is_hitting_start_limit(duthost, service_name)

if not pre_rate_limited:
# 1) Proactively clear stale failure counters and try a normal restart
duthost.shell(
f"sudo systemctl reset-failed {service_name}.service",
module_ignore_errors=True
)
ret = duthost.shell(
f"sudo systemctl restart {service_name}.service",
module_ignore_errors=True
)
rate_limited = is_hitting_start_limit(duthost, service_name)
else:
logger.info(
f"StartLimitHit pre-detected for {service_name}, applying reset-failed and "
f"fixed backoff {backoff_seconds}s before start"
)
# Force the recovery path below without attempting an immediate restart.
ret = {"rc": 1}
rate_limited = True

# 2/3) Recovery path: reset-failed + backoff + start if needed
if ret.get("rc", 1) != 0 or rate_limited or not is_container_running(duthost, service_name):
duthost.shell(
f"sudo systemctl reset-failed {service_name}.service",
module_ignore_errors=True
)
time.sleep(backoff_seconds if rate_limited else 1)
duthost.shell(
f"sudo systemctl start {service_name}.service",
module_ignore_errors=True
)
pytest_assert(
wait_until(verify_timeout, 1, 0, check_container_state, duthost, service_name, True),
f"{service_name} container did not become running after recovery start"
)

return True


def get_group_program_info(duthost, container_name, group_name):
"""Gets program names, running status and their pids by analyzing the command
output of "docker exec <container_name> supervisorctl status". Program name
Expand Down
20 changes: 20 additions & 0 deletions tests/common/macsec/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,27 @@ def macsec_setup(self, startup_macsec, shutdown_macsec, macsec_feature):

@pytest.fixture(scope="module", autouse=True)
def load_macsec_info(self, request, macsec_duthost, ctrl_links, macsec_profile, tbinfo):
"""Pre-load MACsec session info for all control links.

If MACsec is enabled and configured for this DUT/profile, wait for
MKA establishment (APP/STATE DB populated with SC/SA, including SAK)
before calling ``load_all_macsec_info``. This avoids races where
``get_macsec_attr`` hits APP_DB before the egress SA row (and ``sak``)
has been written by wpa_supplicant.
"""

if get_macsec_enable_status(macsec_duthost) and get_macsec_profile(macsec_duthost):
# Ensure MKA sessions are established (SC/SA present in DB) if the
# test environment provides the wait_mka_establish fixture
# (defined in tests/macsec/conftest.py). For environments that do
# not define it, fall back to the original behaviour.
try:
request.getfixturevalue('wait_mka_establish')
except pytest.FixtureLookupError:
# Some environments do not define wait_mka_establish; fall back
# to the original behaviour when the fixture is missing.
pass

if is_macsec_configured(macsec_duthost, macsec_profile, ctrl_links):
load_all_macsec_info(macsec_duthost, ctrl_links, tbinfo)
else:
Expand Down
108 changes: 105 additions & 3 deletions tests/common/macsec/macsec_config_helper.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import logging
import time

from tests.common.macsec.macsec_helper import get_mka_session, getns_prefix, wait_all_complete, \
submit_async_task, load_all_macsec_info
from tests.common.macsec.macsec_platform_helper import global_cmd, find_portchannel_from_member, get_portchannel
Expand All @@ -17,7 +16,8 @@
'enable_macsec_port',
'disable_macsec_port',
'get_macsec_enable_status',
'get_macsec_profile'
'get_macsec_profile',
'wait_for_macsec_cleanup'
]

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -219,9 +219,21 @@ def cleanup_macsec_configuration(duthost, ctrl_links, profile_name):
submit_async_task(delete_macsec_profile, (d, None, profile_name))
wait_all_complete(timeout=300)

logger.info("Cleanup macsec configuration step3: wait for automatic cleanup")

# Extract DUT interface names from ctrl_links and wait for automatic
# MACsec cleanup on the DUT side.
interfaces = list(ctrl_links.keys())
wait_for_macsec_cleanup(duthost, interfaces)

# Also wait for neighbor devices to complete automatic cleanup for their
# corresponding ports.
for dut_port, nbr in list(ctrl_links.items()):
wait_for_macsec_cleanup(nbr["host"], [nbr["port"]])

logger.info("Cleanup macsec configuration finished")

# Waiting for all mka session were cleared in all devices
# Waiting for all MKA sessions to be cleared on neighbor devices.
for d in devices:
if isinstance(d, EosHost):
continue
Expand Down Expand Up @@ -268,3 +280,93 @@ def setup_macsec_configuration(duthost, ctrl_links, profile_name, default_priori

# Load the MACSEC_INFO, to have data of all macsec sessions
load_all_macsec_info(duthost, ctrl_links, tbinfo)


def wait_for_macsec_cleanup(host, interfaces, timeout=90):
"""Wait for MACsec daemon to automatically clean up all MACsec entries.

This function implements proper synchronization to wait for the automatic
cleanup process to complete, preserving the intended MACsec cleanup behavior.

Args:
host: SONiC DUT or neighbor host object
interfaces: List of interface names to check
timeout: Maximum time to wait in seconds for MACsec cleanup to finish (default: 90).

Returns:
bool: True if cleanup completed, False if timeout
"""
if isinstance(host, EosHost):
# EOS hosts don't use Redis databases
logger.info("EOS host detected, skipping Redis cleanup verification")
return True

logger.info(f"Waiting for automatic MACsec cleanup (timeout: {timeout}s)")

start_time = time.time()
# Poll at most ~10 times over the full timeout, capped at 10 seconds between checks.
poll_interval = min(10, max(1, timeout / 10.0))

# We only care about APPL_DB and STATE_DB for MACsec tables. Instead of
# trying to reverse-engineer numeric DB IDs from CONFIG_DB, rely on
# sonic-db-cli with logical DB names and the same namespace logic used
# elsewhere in MACsec helpers.

while time.time() - start_time < timeout:
all_clean = True
remaining_entries = {}

for interface in interfaces:
ns_prefix = getns_prefix(host, interface)

for db_name, sep in (("APPL_DB", ":"), ("STATE_DB", "|")):
pattern = f"MACSEC_*{sep}{interface}*"
cmd = f"sonic-db-cli {ns_prefix} {db_name} KEYS '{pattern}'"

try:
result = host.command(cmd, verbose=False)
out_lines = result.get("stdout_lines", [])
except Exception as e:
logger.warning(
"Failed to query MACsec keys on host %s, DB %s, interface %s: %r",
getattr(host, 'hostname', host),
db_name,
interface,
e,
)
# If we cannot query Redis for this DB/interface, be
# conservative and assume cleanup is not complete yet.
all_clean = False
continue

keys = [k.strip() for k in out_lines if k.strip()]
if keys:
all_clean = False
remaining_entries.setdefault((db_name, interface), []).extend(keys)

elapsed = time.time() - start_time

if all_clean:
logger.info(
f"Automatic MACsec cleanup completed successfully in {elapsed:.1f}s"
)
return True

# Log progress every 30 seconds to reduce verbosity
if int(elapsed) % 30 == 0 and elapsed > 0:
logger.info(f"Still waiting for cleanup... ({elapsed:.0f}s elapsed)")

time.sleep(poll_interval)

# Timeout reached
elapsed = time.time() - start_time
logger.warning(f"Automatic MACsec cleanup timeout after {elapsed:.1f}s")

# Log summary of remaining entries
total_remaining = sum(len(entries) for entries in remaining_entries.values())
if total_remaining > 0:
logger.warning(
f" {total_remaining} MACsec entries still remain after timeout"
)

return False
4 changes: 3 additions & 1 deletion tests/macsec/test_docker_restart.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from tests.common.utilities import wait_until
from tests.common.macsec.macsec_helper import check_appl_db
from tests.common.helpers.dut_utils import restart_service_with_startlimit_guard


logger = logging.getLogger(__name__)

Expand All @@ -17,6 +19,6 @@ def test_restart_macsec_docker(duthosts, ctrl_links, policy, cipher_suite, send_
duthost = duthosts[enum_rand_one_per_hwsku_macsec_frontend_hostname]

logger.info(duthost.shell(cmd="docker ps", module_ignore_errors=True)['stdout'])
duthost.restart_service("macsec")
restart_service_with_startlimit_guard(duthost, "macsec", backoff_seconds=35, verify_timeout=180)
logger.info(duthost.shell(cmd="docker ps", module_ignore_errors=True)['stdout'])
assert wait_until(300, 6, 12, check_appl_db, duthost, ctrl_links, policy, cipher_suite, send_sci)
Loading