Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
233 changes: 193 additions & 40 deletions tests/crm/test_crm.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
logger = logging.getLogger(__name__)

SONIC_RES_UPDATE_TIME = 50
SONIC_RES_CLEANUP_UPDATE_TIME = 20
CONFIG_UPDATE_TIME = 5
FDB_CLEAR_TIMEOUT = 20
ROUTE_COUNTER_POLL_TIMEOUT = 15
CRM_COUNTER_TOLERANCE = 2
ACL_TABLE_NAME = "DATAACL"

RESTORE_CMDS = {"test_crm_route": [],
Expand Down Expand Up @@ -126,7 +131,9 @@ def apply_acl_config(duthost, asichost, test_name, collector, entry_num=1):
pytest.skip("DATAACL does not exist")

# Make sure CRM counters updated
time.sleep(CRM_UPDATE_TIME)
# ACL configuration usually propagates in 3-5 seconds
logger.info("Waiting for ACL configuration to propagate...")
time.sleep(CONFIG_UPDATE_TIME)

collector["acl_tbl_key"] = get_acl_tbl_key(asichost)

Expand Down Expand Up @@ -206,7 +213,9 @@ def apply_fdb_config(duthost, test_name, vlan_id, iface, entry_num):
duthost.command(cmd)

# Make sure CRM counters updated
time.sleep(CRM_UPDATE_TIME)
# FDB entries typically propagate in 3-5 seconds
logger.info("Waiting for FDB entries to propagate...")
time.sleep(CONFIG_UPDATE_TIME)


def get_acl_tbl_key(asichost):
Expand Down Expand Up @@ -315,6 +324,104 @@ def check_crm_stats(cmd, duthost, origin_crm_stats_used, origin_crm_stats_availa
return False


def wait_for_crm_counter_update(cmd, duthost, expected_used, oper_used=">=", timeout=15, interval=1):
"""
Wait for CRM used counter to update by polling.
Raises pytest.fail() if timeout is reached.
"""
last_values = {'used': None, 'avail': None}

def check_counters():
try:
crm_used, crm_avail = get_crm_stats(cmd, duthost)
last_values['used'] = crm_used
last_values['avail'] = crm_avail
used_ok = eval(f"{crm_used} {oper_used} {expected_used}")

if used_ok:
logger.info(f"CRM counter updated: used={crm_used} (expected {oper_used} {expected_used})")
return True
else:
logger.debug(f"Waiting for CRM update: used={crm_used} (expected {oper_used} {expected_used})")
return False
except Exception as e:
logger.debug(f"Error checking CRM stats: {e}")
return False

pytest_assert(wait_until(timeout, interval, 0, check_counters),
f"CRM counter did not reach expected value within {timeout} seconds. "
f"Expected: used {oper_used} {expected_used}, "
f"Actual: used={last_values['used']}, available={last_values['avail']}")


def wait_for_resource_stabilization(get_stats_func, duthost, asic_index, resource_key,
min_expected_used=None, tolerance_percent=5,
timeout=60, interval=5):
"""
Wait for large resource configurations to stabilize by polling.
Raises pytest.fail() if timeout is reached.
"""
logger.info("Waiting for {} resources to stabilize (expecting at least {} used)...".format(
resource_key, min_expected_used if min_expected_used else "N/A"))

stable_count = 0
prev_used = None
start_time = time.time()

def check_stabilized():
nonlocal stable_count, prev_used
try:
stats = get_stats_func(duthost, asic_index)
current_used = stats[resource_key]['used']
current_avail = stats[resource_key]['available']

logger.debug("{} used: {}, available: {}".format(resource_key, current_used, current_avail))

# Check if we've reached minimum expected usage
if min_expected_used and current_used < min_expected_used * (1 - tolerance_percent / 100):
logger.debug("Still adding resources: {} < {} (min expected)".format(
current_used, min_expected_used))
prev_used = current_used
stable_count = 0
return False

# Check if counter is stable (not changing significantly)
if prev_used is not None:
change = abs(current_used - prev_used)
if change <= max(1, current_used * tolerance_percent / 100):
stable_count += 1
if stable_count >= 2: # Stable for 2 consecutive checks
logger.info("{} resources stabilized at used={}, available={}".format(
resource_key, current_used, current_avail))
return True
else:
stable_count = 0

prev_used = current_used
return False
except Exception as e:
logger.debug("Error checking resource stats: {}".format(e))
return False

if wait_until(timeout, interval, 0, check_stabilized):
elapsed = time.time() - start_time
logger.info("{} stabilization took {:.1f} seconds".format(resource_key, elapsed))
else:
# final stats for error message
try:
final_stats = get_stats_func(duthost, asic_index)
final_used = final_stats[resource_key]['used']
final_avail = final_stats[resource_key]['available']
except Exception:
final_used = prev_used
final_avail = "unknown"

pytest.fail("{} resources did not stabilize within {} seconds. "
"Expected min: {}, Actual: used={}, available={}".format(
resource_key, timeout, min_expected_used if min_expected_used else "N/A",
final_used, final_avail))


def generate_neighbors(amount, ip_ver):
""" Generate list of IPv4 or IPv6 addresses """
if ip_ver == "4":
Expand Down Expand Up @@ -453,7 +560,9 @@ def configure_neighbors(amount, interface, ip_ver, asichost, test_name):
iface=interface,
namespace=asichost.namespace))
# Make sure CRM counters updated
time.sleep(CRM_UPDATE_TIME)
# Neighbor additions typically propagate in 3-5 seconds
logger.info("Waiting for neighbor entries to propagate...")
time.sleep(CONFIG_UPDATE_TIME)


def get_entries_num(used, available):
Expand Down Expand Up @@ -562,8 +671,22 @@ def test_crm_route(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_fro
if duthost.facts['asic_type'] == 'broadcom':
check_available_counters = False

# Make sure CRM counters updated
time.sleep(CRM_UPDATE_TIME)
# Helper function to get current route used counter
def get_route_used():
stats = get_crm_resources_fdb_and_ip_route(duthost, enum_frontend_asic_index)
return stats[f'ipv{ip_ver}_route']['used']

# Make sure CRM counters updated - use polling to wait for route counter to update
logger.info(f"Waiting for route counters to update after adding {total_routes} routes...")
expected_min_used = crm_stats_route_used + total_routes - CRM_COUNTER_TOLERANCE

def check_route_added():
return get_route_used() >= expected_min_used

pytest_assert(wait_until(ROUTE_COUNTER_POLL_TIMEOUT, CRM_POLLING_INTERVAL, 0, check_route_added),
f"Route counter did not update after adding {total_routes} routes "
f"within {ROUTE_COUNTER_POLL_TIMEOUT} seconds. "
f"Expected: used >= {expected_min_used}, Actual: used={get_route_used()}")

# Get new ipv[4/6]_route/fdb_entry used and available counter value
crm_stats = get_crm_resources_fdb_and_ip_route(duthost, enum_frontend_asic_index)
Expand Down Expand Up @@ -596,8 +719,17 @@ def test_crm_route(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_fro
for i in range(total_routes):
duthost.command(route_del_cmd.format(asichost.ip_cmd, i, nh_ip))

# Make sure CRM counters updated
time.sleep(CRM_UPDATE_TIME)
# Make sure CRM counters updated - use polling to wait for route counter to update
logger.info(f"Waiting for route counters to update after deleting {total_routes} routes...")
expected_max_used = crm_stats_route_used - total_routes + CRM_COUNTER_TOLERANCE

def check_route_deleted():
return get_route_used() <= expected_max_used

pytest_assert(wait_until(ROUTE_COUNTER_POLL_TIMEOUT, CRM_POLLING_INTERVAL, 0, check_route_deleted),
f"Route counter did not update after deleting {total_routes} routes "
f"within {ROUTE_COUNTER_POLL_TIMEOUT} seconds. "
f"Expected: used <= {expected_max_used}, Actual: used={get_route_used()}")

# Get new ipv[4/6]_route/fdb_entry used and available counter value
crm_stats = get_crm_resources_fdb_and_ip_route(duthost, enum_frontend_asic_index)
Expand Down Expand Up @@ -644,11 +776,14 @@ def test_crm_route(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_fro
duthost.shell(add_routes_template.render(routes_list=routes_list,
interface=crm_interface[0],
namespace=asichost.namespace))
logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
# Make sure SONIC configure expected entries
time.sleep(SONIC_RES_UPDATE_TIME)
# Wait for resources to stabilize using adaptive polling
expected_routes = new_crm_stats_route_used + routes_num
logger.info("Waiting for {} route resources to stabilize".format(routes_num))
wait_for_resource_stabilization(get_crm_resources_fdb_and_ip_route, duthost,
enum_frontend_asic_index, 'ipv{}_route'.format(ip_ver),
min_expected_used=expected_routes, timeout=60, interval=5)

RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME

# Verify thresholds for "IPv[4/6] route" CRM resource
# Get "crm_stats_ipv[4/6]_route" used and available counter value
Expand Down Expand Up @@ -770,11 +905,14 @@ def test_crm_nexthop(duthosts, enum_rand_one_per_hwsku_frontend_hostname,
configure_neighbors(amount=neighbours_num, interface=crm_interface[0], ip_ver=ip_ver,
asichost=asichost, test_name="test_crm_nexthop")

logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
# Make sure SONIC configure expected entries
time.sleep(SONIC_RES_UPDATE_TIME)
# Wait for nexthop resources to stabilize using polling
expected_nexthop_used = new_crm_stats_nexthop_used + neighbours_num - CRM_COUNTER_TOLERANCE
logger.info("Waiting for {} nexthop resources to stabilize (expecting ~{} total used)...".format(
neighbours_num, expected_nexthop_used))
wait_for_crm_counter_update(get_nexthop_stats, duthost, expected_used=expected_nexthop_used,
oper_used=">=", timeout=60, interval=5)

RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME

# Verify thresholds for "IPv[4/6] nexthop" CRM resource
verify_thresholds(duthost, asichost, crm_cli_res="ipv{ip_ver} nexthop".format(ip_ver=ip_ver),
Expand Down Expand Up @@ -853,11 +991,13 @@ def test_crm_neighbor(duthosts, enum_rand_one_per_hwsku_frontend_hostname,
configure_neighbors(amount=neighbours_num, interface=crm_interface[0], ip_ver=ip_ver,
asichost=asichost, test_name="test_crm_neighbor")

logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
# Make sure SONIC configure expected entries
time.sleep(SONIC_RES_UPDATE_TIME)
# Wait for neighbor resources to stabilize using polling
expected_neighbor_used = new_crm_stats_neighbor_used + neighbours_num - CRM_COUNTER_TOLERANCE
logger.info("Waiting for {} neighbor resources to stabilize".format(neighbours_num))
wait_for_crm_counter_update(get_neighbor_stats, duthost, expected_used=expected_neighbor_used,
oper_used=">=", timeout=60, interval=5)

RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME

# Verify thresholds for "IPv[4/6] neighbor" CRM resource
verify_thresholds(duthost, asichost, crm_cli_res="ipv{ip_ver} neighbor".format(ip_ver=ip_ver),
Expand Down Expand Up @@ -976,11 +1116,13 @@ def test_crm_nexthop_group(duthosts, enum_rand_one_per_hwsku_frontend_hostname,
asichost=asichost, test_name="test_crm_nexthop_group",
chunk_size=chunk_size)

logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
# Make sure SONIC configure expected entries
time.sleep(SONIC_RES_UPDATE_TIME)
# Wait for nexthop group resources to stabilize using polling
expected_nhg_used = new_nexthop_group_used + nexthop_group_num - CRM_COUNTER_TOLERANCE
logger.info("Waiting for {} nexthop group resources to stabilize".format(nexthop_group_num))
wait_for_crm_counter_update(get_nexthop_group_stats, duthost, expected_used=expected_nhg_used,
oper_used=">=", timeout=60, interval=5)

RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME

verify_thresholds(duthost, asichost, crm_cli_res=redis_threshold, crm_cmd=get_nexthop_group_stats)

Expand Down Expand Up @@ -1058,8 +1200,11 @@ def verify_acl_crm_stats(duthost, asichost, enum_rand_one_per_hwsku_frontend_hos
logger.info(f"Next hop group number: {nexthop_group_num}")
apply_acl_config(duthost, asichost, "test_acl_entry", asic_collector, nexthop_group_num)

# Make sure SONIC configure expected entries
time.sleep(SONIC_RES_UPDATE_TIME)
# Wait for ACL entry resources to stabilize using polling
expected_acl_used = new_crm_stats_acl_entry_used + nexthop_group_num - CRM_COUNTER_TOLERANCE
logger.info("Waiting for {} ACL entry resources to stabilize".format(nexthop_group_num))
wait_for_crm_counter_update(get_acl_entry_stats, duthost, expected_used=expected_acl_used,
oper_used=">=", timeout=60, interval=5)

# Verify thresholds for "ACL entry" CRM resource
verify_thresholds(duthost, asichost, crm_cli_res="acl group entry", crm_cmd=get_acl_entry_stats)
Expand Down Expand Up @@ -1179,12 +1324,14 @@ def test_acl_counter(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_f
crm_stats_acl_entry_available".format(db_cli=asichost.sonic_db_cli, acl_tbl_key=acl_tbl_key)
_, available_acl_entry_num = get_crm_stats(get_acl_entry_stats, duthost)
# The number we can applied is limited to available_acl_entry_num
apply_acl_config(duthost, asichost, "test_acl_counter", asic_collector,
min(needed_acl_counter_num, available_acl_entry_num))
actual_acl_count = min(needed_acl_counter_num, available_acl_entry_num)
apply_acl_config(duthost, asichost, "test_acl_counter", asic_collector, actual_acl_count)

logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
# Make sure SONIC configure expected entries
time.sleep(SONIC_RES_UPDATE_TIME)
# Wait for ACL counter resources to stabilize using polling
expected_acl_counter_used = new_crm_stats_acl_counter_used + actual_acl_count - CRM_COUNTER_TOLERANCE
logger.info("Waiting for {} ACL counter resources to stabilize".format(actual_acl_count))
wait_for_crm_counter_update(get_acl_counter_stats, duthost, expected_used=expected_acl_counter_used,
oper_used=">=", timeout=60, interval=5)

new_crm_stats_acl_counter_used, new_crm_stats_acl_counter_available = \
get_crm_stats(get_acl_counter_stats, duthost)
Expand Down Expand Up @@ -1298,11 +1445,14 @@ def test_crm_fdb_entry(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum
fdb_entries_num = get_entries_num(new_crm_stats_fdb_entry_used, new_crm_stats_fdb_entry_available)
# Generate FDB json file with 'fdb_entries_num' entries and apply it on DUT
apply_fdb_config(duthost, "test_crm_fdb_entry", vlan_id, iface, fdb_entries_num)
logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
# Make sure SONIC configure expected entries
time.sleep(SONIC_RES_UPDATE_TIME)

RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
# Wait for FDB entry resources to stabilize using polling
expected_fdb_used = new_crm_stats_fdb_entry_used + fdb_entries_num - CRM_COUNTER_TOLERANCE
logger.info("Waiting for {} FDB entry resources to stabilize".format(fdb_entries_num))
wait_for_crm_counter_update(get_fdb_stats, duthost, expected_used=expected_fdb_used,
oper_used=">=", timeout=60, interval=5)

RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME

# Verify thresholds for "FDB entry" CRM resource
verify_thresholds(duthost, asichost, crm_cli_res="fdb", crm_cmd=get_fdb_stats)
Expand All @@ -1311,16 +1461,19 @@ def test_crm_fdb_entry(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum
cmd = "fdbclear"
duthost.command(cmd)

# Make sure CRM counters updated
time.sleep(CRM_UPDATE_TIME)
# Timeout for asyc fdb clear
FDB_CLEAR_TIMEOUT = 10
while FDB_CLEAR_TIMEOUT > 0:
# Make sure CRM counters updated - use polling
# Wait for FDB clear to complete with async polling
logger.info("Waiting for FDB clear to complete...")
fdb_clear_timeout = FDB_CLEAR_TIMEOUT
new_crm_stats_fdb_entry_used = None
new_crm_stats_fdb_entry_available = None
while fdb_clear_timeout > 0:
# Get new "crm_stats_fdb_entry" used and available counter value
new_crm_stats_fdb_entry_used, new_crm_stats_fdb_entry_available = get_crm_stats(get_fdb_stats, duthost)
if new_crm_stats_fdb_entry_used == 0:
logger.debug("FDB cleared successfully")
break
FDB_CLEAR_TIMEOUT -= CRM_POLLING_INTERVAL
fdb_clear_timeout -= CRM_POLLING_INTERVAL
time.sleep(CRM_POLLING_INTERVAL)

# Verify "crm_stats_fdb_entry_used" counter was decremented
Expand Down
Loading