Skip to content

Commit 81db15e

Browse files
AntonHryshchukravaliyel
authored andcommitted
[Scale|CRM] Optimize test runtime with adaptive polling (sonic-net#22132)
Summary: Replace fixed sleeps with polling and reduce wait times: Add polling helpers: wait_for_crm_counter_update(), wait_for_resource_stabilization() Replace 50s resource waits with adaptive polling Reduce config waits from 10s to 5s (CONFIG_UPDATE_TIME) Reduce cleanup wait from 50s to 20s (SONIC_RES_CLEANUP_UPDATE_TIME) Signed-off-by: AntonHryshchuk <[email protected]> Signed-off-by: Ravali Yeluri (WIPRO LIMITED) <[email protected]>
1 parent 6bc3e00 commit 81db15e

1 file changed

Lines changed: 193 additions & 40 deletions

File tree

tests/crm/test_crm.py

Lines changed: 193 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@
2828
logger = logging.getLogger(__name__)
2929

3030
SONIC_RES_UPDATE_TIME = 50
31+
SONIC_RES_CLEANUP_UPDATE_TIME = 20
32+
CONFIG_UPDATE_TIME = 5
33+
FDB_CLEAR_TIMEOUT = 20
34+
ROUTE_COUNTER_POLL_TIMEOUT = 15
35+
CRM_COUNTER_TOLERANCE = 2
3136
ACL_TABLE_NAME = "DATAACL"
3237

3338
RESTORE_CMDS = {"test_crm_route": [],
@@ -126,7 +131,9 @@ def apply_acl_config(duthost, asichost, test_name, collector, entry_num=1):
126131
pytest.skip("DATAACL does not exist")
127132

128133
# Make sure CRM counters updated
129-
time.sleep(CRM_UPDATE_TIME)
134+
# ACL configuration usually propagates in 3-5 seconds
135+
logger.info("Waiting for ACL configuration to propagate...")
136+
time.sleep(CONFIG_UPDATE_TIME)
130137

131138
collector["acl_tbl_key"] = get_acl_tbl_key(asichost)
132139

@@ -206,7 +213,9 @@ def apply_fdb_config(duthost, test_name, vlan_id, iface, entry_num):
206213
duthost.command(cmd)
207214

208215
# Make sure CRM counters updated
209-
time.sleep(CRM_UPDATE_TIME)
216+
# FDB entries typically propagate in 3-5 seconds
217+
logger.info("Waiting for FDB entries to propagate...")
218+
time.sleep(CONFIG_UPDATE_TIME)
210219

211220

212221
def get_acl_tbl_key(asichost):
@@ -315,6 +324,104 @@ def check_crm_stats(cmd, duthost, origin_crm_stats_used, origin_crm_stats_availa
315324
return False
316325

317326

327+
def wait_for_crm_counter_update(cmd, duthost, expected_used, oper_used=">=", timeout=15, interval=1):
328+
"""
329+
Wait for CRM used counter to update by polling.
330+
Raises pytest.fail() if timeout is reached.
331+
"""
332+
last_values = {'used': None, 'avail': None}
333+
334+
def check_counters():
335+
try:
336+
crm_used, crm_avail = get_crm_stats(cmd, duthost)
337+
last_values['used'] = crm_used
338+
last_values['avail'] = crm_avail
339+
used_ok = eval(f"{crm_used} {oper_used} {expected_used}")
340+
341+
if used_ok:
342+
logger.info(f"CRM counter updated: used={crm_used} (expected {oper_used} {expected_used})")
343+
return True
344+
else:
345+
logger.debug(f"Waiting for CRM update: used={crm_used} (expected {oper_used} {expected_used})")
346+
return False
347+
except Exception as e:
348+
logger.debug(f"Error checking CRM stats: {e}")
349+
return False
350+
351+
pytest_assert(wait_until(timeout, interval, 0, check_counters),
352+
f"CRM counter did not reach expected value within {timeout} seconds. "
353+
f"Expected: used {oper_used} {expected_used}, "
354+
f"Actual: used={last_values['used']}, available={last_values['avail']}")
355+
356+
357+
def wait_for_resource_stabilization(get_stats_func, duthost, asic_index, resource_key,
358+
min_expected_used=None, tolerance_percent=5,
359+
timeout=60, interval=5):
360+
"""
361+
Wait for large resource configurations to stabilize by polling.
362+
Raises pytest.fail() if timeout is reached.
363+
"""
364+
logger.info("Waiting for {} resources to stabilize (expecting at least {} used)...".format(
365+
resource_key, min_expected_used if min_expected_used else "N/A"))
366+
367+
stable_count = 0
368+
prev_used = None
369+
start_time = time.time()
370+
371+
def check_stabilized():
372+
nonlocal stable_count, prev_used
373+
try:
374+
stats = get_stats_func(duthost, asic_index)
375+
current_used = stats[resource_key]['used']
376+
current_avail = stats[resource_key]['available']
377+
378+
logger.debug("{} used: {}, available: {}".format(resource_key, current_used, current_avail))
379+
380+
# Check if we've reached minimum expected usage
381+
if min_expected_used and current_used < min_expected_used * (1 - tolerance_percent / 100):
382+
logger.debug("Still adding resources: {} < {} (min expected)".format(
383+
current_used, min_expected_used))
384+
prev_used = current_used
385+
stable_count = 0
386+
return False
387+
388+
# Check if counter is stable (not changing significantly)
389+
if prev_used is not None:
390+
change = abs(current_used - prev_used)
391+
if change <= max(1, current_used * tolerance_percent / 100):
392+
stable_count += 1
393+
if stable_count >= 2: # Stable for 2 consecutive checks
394+
logger.info("{} resources stabilized at used={}, available={}".format(
395+
resource_key, current_used, current_avail))
396+
return True
397+
else:
398+
stable_count = 0
399+
400+
prev_used = current_used
401+
return False
402+
except Exception as e:
403+
logger.debug("Error checking resource stats: {}".format(e))
404+
return False
405+
406+
if wait_until(timeout, interval, 0, check_stabilized):
407+
elapsed = time.time() - start_time
408+
logger.info("{} stabilization took {:.1f} seconds".format(resource_key, elapsed))
409+
else:
410+
# final stats for error message
411+
try:
412+
final_stats = get_stats_func(duthost, asic_index)
413+
final_used = final_stats[resource_key]['used']
414+
final_avail = final_stats[resource_key]['available']
415+
except Exception:
416+
final_used = prev_used
417+
final_avail = "unknown"
418+
419+
pytest.fail("{} resources did not stabilize within {} seconds. "
420+
"Expected min: {}, Actual: used={}, available={}".format(
421+
resource_key, timeout, min_expected_used if min_expected_used else "N/A",
422+
final_used, final_avail))
423+
424+
318425
def generate_neighbors(amount, ip_ver):
319426
""" Generate list of IPv4 or IPv6 addresses """
320427
if ip_ver == "4":
@@ -453,7 +560,9 @@ def configure_neighbors(amount, interface, ip_ver, asichost, test_name):
453560
iface=interface,
454561
namespace=asichost.namespace))
455562
# Make sure CRM counters updated
456-
time.sleep(CRM_UPDATE_TIME)
563+
# Neighbor additions typically propagate in 3-5 seconds
564+
logger.info("Waiting for neighbor entries to propagate...")
565+
time.sleep(CONFIG_UPDATE_TIME)
457566

458567

459568
def get_entries_num(used, available):
@@ -562,8 +671,22 @@ def test_crm_route(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_fro
562671
if duthost.facts['asic_type'] == 'broadcom':
563672
check_available_counters = False
564673

565-
# Make sure CRM counters updated
566-
time.sleep(CRM_UPDATE_TIME)
674+
# Helper function to get current route used counter
675+
def get_route_used():
676+
stats = get_crm_resources_fdb_and_ip_route(duthost, enum_frontend_asic_index)
677+
return stats[f'ipv{ip_ver}_route']['used']
678+
679+
# Make sure CRM counters updated - use polling to wait for route counter to update
680+
logger.info(f"Waiting for route counters to update after adding {total_routes} routes...")
681+
expected_min_used = crm_stats_route_used + total_routes - CRM_COUNTER_TOLERANCE
682+
683+
def check_route_added():
684+
return get_route_used() >= expected_min_used
685+
686+
pytest_assert(wait_until(ROUTE_COUNTER_POLL_TIMEOUT, CRM_POLLING_INTERVAL, 0, check_route_added),
687+
f"Route counter did not update after adding {total_routes} routes "
688+
f"within {ROUTE_COUNTER_POLL_TIMEOUT} seconds. "
689+
f"Expected: used >= {expected_min_used}, Actual: used={get_route_used()}")
567690

568691
# Get new ipv[4/6]_route/fdb_entry used and available counter value
569692
crm_stats = get_crm_resources_fdb_and_ip_route(duthost, enum_frontend_asic_index)
@@ -596,8 +719,17 @@ def test_crm_route(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_fro
596719
for i in range(total_routes):
597720
duthost.command(route_del_cmd.format(asichost.ip_cmd, i, nh_ip))
598721

599-
# Make sure CRM counters updated
600-
time.sleep(CRM_UPDATE_TIME)
722+
# Make sure CRM counters updated - use polling to wait for route counter to update
723+
logger.info(f"Waiting for route counters to update after deleting {total_routes} routes...")
724+
expected_max_used = crm_stats_route_used - total_routes + CRM_COUNTER_TOLERANCE
725+
726+
def check_route_deleted():
727+
return get_route_used() <= expected_max_used
728+
729+
pytest_assert(wait_until(ROUTE_COUNTER_POLL_TIMEOUT, CRM_POLLING_INTERVAL, 0, check_route_deleted),
730+
f"Route counter did not update after deleting {total_routes} routes "
731+
f"within {ROUTE_COUNTER_POLL_TIMEOUT} seconds. "
732+
f"Expected: used <= {expected_max_used}, Actual: used={get_route_used()}")
601733

602734
# Get new ipv[4/6]_route/fdb_entry used and available counter value
603735
crm_stats = get_crm_resources_fdb_and_ip_route(duthost, enum_frontend_asic_index)
@@ -644,11 +776,14 @@ def test_crm_route(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_fro
644776
duthost.shell(add_routes_template.render(routes_list=routes_list,
645777
interface=crm_interface[0],
646778
namespace=asichost.namespace))
647-
logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
648-
# Make sure SONIC configure expected entries
649-
time.sleep(SONIC_RES_UPDATE_TIME)
779+
# Wait for resources to stabilize using adaptive polling
780+
expected_routes = new_crm_stats_route_used + routes_num
781+
logger.info("Waiting for {} route resources to stabilize".format(routes_num))
782+
wait_for_resource_stabilization(get_crm_resources_fdb_and_ip_route, duthost,
783+
enum_frontend_asic_index, 'ipv{}_route'.format(ip_ver),
784+
min_expected_used=expected_routes, timeout=60, interval=5)
650785

651-
RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
786+
RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME
652787

653788
# Verify thresholds for "IPv[4/6] route" CRM resource
654789
# Get "crm_stats_ipv[4/6]_route" used and available counter value
@@ -770,11 +905,14 @@ def test_crm_nexthop(duthosts, enum_rand_one_per_hwsku_frontend_hostname,
770905
configure_neighbors(amount=neighbours_num, interface=crm_interface[0], ip_ver=ip_ver,
771906
asichost=asichost, test_name="test_crm_nexthop")
772907

773-
logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
774-
# Make sure SONIC configure expected entries
775-
time.sleep(SONIC_RES_UPDATE_TIME)
908+
# Wait for nexthop resources to stabilize using polling
909+
expected_nexthop_used = new_crm_stats_nexthop_used + neighbours_num - CRM_COUNTER_TOLERANCE
910+
logger.info("Waiting for {} nexthop resources to stabilize (expecting ~{} total used)...".format(
911+
neighbours_num, expected_nexthop_used))
912+
wait_for_crm_counter_update(get_nexthop_stats, duthost, expected_used=expected_nexthop_used,
913+
oper_used=">=", timeout=60, interval=5)
776914

777-
RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
915+
RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME
778916

779917
# Verify thresholds for "IPv[4/6] nexthop" CRM resource
780918
verify_thresholds(duthost, asichost, crm_cli_res="ipv{ip_ver} nexthop".format(ip_ver=ip_ver),
@@ -853,11 +991,13 @@ def test_crm_neighbor(duthosts, enum_rand_one_per_hwsku_frontend_hostname,
853991
configure_neighbors(amount=neighbours_num, interface=crm_interface[0], ip_ver=ip_ver,
854992
asichost=asichost, test_name="test_crm_neighbor")
855993

856-
logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
857-
# Make sure SONIC configure expected entries
858-
time.sleep(SONIC_RES_UPDATE_TIME)
994+
# Wait for neighbor resources to stabilize using polling
995+
expected_neighbor_used = new_crm_stats_neighbor_used + neighbours_num - CRM_COUNTER_TOLERANCE
996+
logger.info("Waiting for {} neighbor resources to stabilize".format(neighbours_num))
997+
wait_for_crm_counter_update(get_neighbor_stats, duthost, expected_used=expected_neighbor_used,
998+
oper_used=">=", timeout=60, interval=5)
859999

860-
RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
1000+
RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME
8611001

8621002
# Verify thresholds for "IPv[4/6] neighbor" CRM resource
8631003
verify_thresholds(duthost, asichost, crm_cli_res="ipv{ip_ver} neighbor".format(ip_ver=ip_ver),
@@ -976,11 +1116,13 @@ def test_crm_nexthop_group(duthosts, enum_rand_one_per_hwsku_frontend_hostname,
9761116
asichost=asichost, test_name="test_crm_nexthop_group",
9771117
chunk_size=chunk_size)
9781118

979-
logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
980-
# Make sure SONIC configure expected entries
981-
time.sleep(SONIC_RES_UPDATE_TIME)
1119+
# Wait for nexthop group resources to stabilize using polling
1120+
expected_nhg_used = new_nexthop_group_used + nexthop_group_num - CRM_COUNTER_TOLERANCE
1121+
logger.info("Waiting for {} nexthop group resources to stabilize".format(nexthop_group_num))
1122+
wait_for_crm_counter_update(get_nexthop_group_stats, duthost, expected_used=expected_nhg_used,
1123+
oper_used=">=", timeout=60, interval=5)
9821124

983-
RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
1125+
RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME
9841126

9851127
verify_thresholds(duthost, asichost, crm_cli_res=redis_threshold, crm_cmd=get_nexthop_group_stats)
9861128

@@ -1058,8 +1200,11 @@ def verify_acl_crm_stats(duthost, asichost, enum_rand_one_per_hwsku_frontend_hos
10581200
logger.info(f"Next hop group number: {nexthop_group_num}")
10591201
apply_acl_config(duthost, asichost, "test_acl_entry", asic_collector, nexthop_group_num)
10601202

1061-
# Make sure SONIC configure expected entries
1062-
time.sleep(SONIC_RES_UPDATE_TIME)
1203+
# Wait for ACL entry resources to stabilize using polling
1204+
expected_acl_used = new_crm_stats_acl_entry_used + nexthop_group_num - CRM_COUNTER_TOLERANCE
1205+
logger.info("Waiting for {} ACL entry resources to stabilize".format(nexthop_group_num))
1206+
wait_for_crm_counter_update(get_acl_entry_stats, duthost, expected_used=expected_acl_used,
1207+
oper_used=">=", timeout=60, interval=5)
10631208

10641209
# Verify thresholds for "ACL entry" CRM resource
10651210
verify_thresholds(duthost, asichost, crm_cli_res="acl group entry", crm_cmd=get_acl_entry_stats)
@@ -1179,12 +1324,14 @@ def test_acl_counter(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum_f
11791324
crm_stats_acl_entry_available".format(db_cli=asichost.sonic_db_cli, acl_tbl_key=acl_tbl_key)
11801325
_, available_acl_entry_num = get_crm_stats(get_acl_entry_stats, duthost)
11811326
# The number we can applied is limited to available_acl_entry_num
1182-
apply_acl_config(duthost, asichost, "test_acl_counter", asic_collector,
1183-
min(needed_acl_counter_num, available_acl_entry_num))
1327+
actual_acl_count = min(needed_acl_counter_num, available_acl_entry_num)
1328+
apply_acl_config(duthost, asichost, "test_acl_counter", asic_collector, actual_acl_count)
11841329

1185-
logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
1186-
# Make sure SONIC configure expected entries
1187-
time.sleep(SONIC_RES_UPDATE_TIME)
1330+
# Wait for ACL counter resources to stabilize using polling
1331+
expected_acl_counter_used = new_crm_stats_acl_counter_used + actual_acl_count - CRM_COUNTER_TOLERANCE
1332+
logger.info("Waiting for {} ACL counter resources to stabilize".format(actual_acl_count))
1333+
wait_for_crm_counter_update(get_acl_counter_stats, duthost, expected_used=expected_acl_counter_used,
1334+
oper_used=">=", timeout=60, interval=5)
11881335

11891336
new_crm_stats_acl_counter_used, new_crm_stats_acl_counter_available = \
11901337
get_crm_stats(get_acl_counter_stats, duthost)
@@ -1298,11 +1445,14 @@ def test_crm_fdb_entry(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum
12981445
fdb_entries_num = get_entries_num(new_crm_stats_fdb_entry_used, new_crm_stats_fdb_entry_available)
12991446
# Generate FDB json file with 'fdb_entries_num' entries and apply it on DUT
13001447
apply_fdb_config(duthost, "test_crm_fdb_entry", vlan_id, iface, fdb_entries_num)
1301-
logger.info("Waiting {} seconds for SONiC to update resources...".format(SONIC_RES_UPDATE_TIME))
1302-
# Make sure SONIC configure expected entries
1303-
time.sleep(SONIC_RES_UPDATE_TIME)
13041448

1305-
RESTORE_CMDS["wait"] = SONIC_RES_UPDATE_TIME
1449+
# Wait for FDB entry resources to stabilize using polling
1450+
expected_fdb_used = new_crm_stats_fdb_entry_used + fdb_entries_num - CRM_COUNTER_TOLERANCE
1451+
logger.info("Waiting for {} FDB entry resources to stabilize".format(fdb_entries_num))
1452+
wait_for_crm_counter_update(get_fdb_stats, duthost, expected_used=expected_fdb_used,
1453+
oper_used=">=", timeout=60, interval=5)
1454+
1455+
RESTORE_CMDS["wait"] = SONIC_RES_CLEANUP_UPDATE_TIME
13061456

13071457
# Verify thresholds for "FDB entry" CRM resource
13081458
verify_thresholds(duthost, asichost, crm_cli_res="fdb", crm_cmd=get_fdb_stats)
@@ -1311,16 +1461,19 @@ def test_crm_fdb_entry(duthosts, enum_rand_one_per_hwsku_frontend_hostname, enum
13111461
cmd = "fdbclear"
13121462
duthost.command(cmd)
13131463

1314-
# Make sure CRM counters updated
1315-
time.sleep(CRM_UPDATE_TIME)
1316-
# Timeout for asyc fdb clear
1317-
FDB_CLEAR_TIMEOUT = 10
1318-
while FDB_CLEAR_TIMEOUT > 0:
1464+
# Make sure CRM counters updated - use polling
1465+
# Wait for FDB clear to complete with async polling
1466+
logger.info("Waiting for FDB clear to complete...")
1467+
fdb_clear_timeout = FDB_CLEAR_TIMEOUT
1468+
new_crm_stats_fdb_entry_used = None
1469+
new_crm_stats_fdb_entry_available = None
1470+
while fdb_clear_timeout > 0:
13191471
# Get new "crm_stats_fdb_entry" used and available counter value
13201472
new_crm_stats_fdb_entry_used, new_crm_stats_fdb_entry_available = get_crm_stats(get_fdb_stats, duthost)
13211473
if new_crm_stats_fdb_entry_used == 0:
1474+
logger.debug("FDB cleared successfully")
13221475
break
1323-
FDB_CLEAR_TIMEOUT -= CRM_POLLING_INTERVAL
1476+
fdb_clear_timeout -= CRM_POLLING_INTERVAL
13241477
time.sleep(CRM_POLLING_INTERVAL)
13251478

13261479
# Verify "crm_stats_fdb_entry_used" counter was decremented

0 commit comments

Comments
 (0)