Skip to content

Commit ff952fd

Browse files
committed
[warm upgrade] Catch the regression before it becomes a problem - mine and police the protocol convergence timings (sonic-net#23114)
This PR refactors and improves the control plane session recovery gating logic during SONiC image upgrades. The logic is now modular, data-driven, and privacy-compliant, with all thresholds managed in a dedicated JSON file. Debug and legacy code are removed, and a minimal dummy data example is provided for public documentation. How did you do it? Centralized all control plane session recovery gating logic in a helper file; created a structured JSON file for thresholds, supporting multiple HwSKUs and version pairs; removed all debug and legacy code; added logging for unknown HwSKUs and missing thresholds; provided a minimal dummy JSON for public PRs. How did you verify/test it? Unit and integration tested on multiple HwSKUs and version pairs; validated JSON structure and gating logic with both real and dummy data; confirmed correct logging and fallback behavior for unknown/missing data. Test result: CI pipeline ID - 1066532. Signed-off-by: Ravali Yeluri (WIPRO LIMITED) <[email protected]>
1 parent 7ebaddc commit ff952fd

3 files changed

Lines changed: 111 additions & 0 deletions

File tree

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import os
2+
import re
3+
import json
4+
import logging
5+
6+
7+
def controlplane_gating(reboot_timing_dict):
    """Check LACP/BGP session recovery times against per-HwSKU thresholds.

    Thresholds are read from ``hwsku_session_thresholds.json`` located next to
    this module. The file is expected to be nested as
    ``{hwsku: {base_version: {target_version: {"LACP": {"AVG": ...},
    "BGP": {"AVG": ...}}}}}``.

    Args:
        reboot_timing_dict: Mapping with the keys ``"HwSku"``,
            ``"BaseImage"``, ``"TargetImage"``, ``"lacp_session_max_wait"``
            and ``"bgp"``. Missing, empty or non-numeric timing values are
            treated as "not measured" and are not gated.

    Returns:
        A list of human-readable failure strings, one per protocol whose
        measured recovery time exceeded ``AVG + wiggle room``. The list is
        empty when every measured value is within bounds, or when gating is
        skipped (thresholds file unusable, unknown HwSKU, or no LACP/BGP
        thresholds for the base/target version pair).
    """
    THRESHOLDS_FILE = os.path.join(os.path.dirname(__file__), 'hwsku_session_thresholds.json')
    LACP_WIGGLE_ROOM = 10.0  # seconds
    BGP_WIGGLE_ROOM = 10.0  # seconds

    def _get_float(d, key):
        """Return d[key] as a float, or None if absent, blank or non-numeric."""
        val = d.get(key)
        if val is None or str(val).strip() == "":
            return None
        try:
            return float(val)
        except (ValueError, TypeError, OverflowError):
            return None

    def _extract_version(val):
        """Return the first run of six digits in val (e.g. '202405'), or None."""
        m = re.search(r"(\d{6})", str(val))
        return m.group(1) if m else None

    hwsku = reboot_timing_dict.get("HwSku")

    # Report the real reason when the thresholds cannot be loaded instead of
    # silently falling through to a misleading "HwSku not found" message.
    try:
        with open(THRESHOLDS_FILE, 'r', encoding='utf-8') as f:
            thresholds = json.load(f)
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logging.warning(
            "Failed to load thresholds file %s: %s. Skipping controlplane gating.",
            THRESHOLDS_FILE, err
        )
        return []

    if not isinstance(thresholds, dict):
        logging.warning(
            "Thresholds file %s does not contain a JSON object. "
            "Skipping controlplane gating.", THRESHOLDS_FILE
        )
        return []

    if hwsku not in thresholds:
        logging.warning(
            "HwSku=%s not found in thresholds file. Skipping controlplane gating.",
            hwsku
        )
        return []

    lacp_val = _get_float(reboot_timing_dict, "lacp_session_max_wait")
    bgp_val = _get_float(reboot_timing_dict, "bgp")
    base_version = _extract_version(reboot_timing_dict.get("BaseImage"))
    target_version = _extract_version(reboot_timing_dict.get("TargetImage"))

    def _threshold_avg(protocol):
        """Return the AVG threshold for protocol, or None if unavailable."""
        try:
            return float(thresholds[hwsku][base_version][target_version][protocol]["AVG"])
        except (LookupError, TypeError, ValueError, OverflowError):
            return None

    lacp_avg = _threshold_avg("LACP")
    bgp_avg = _threshold_avg("BGP")

    # Gating is all-or-nothing: both protocol thresholds must be present.
    if lacp_avg is None or bgp_avg is None:
        logging.warning(
            "No thresholds found for HwSku=%s, BaseImage=%s, TargetImage=%s. "
            "Skipping controlplane gating.", hwsku, base_version, target_version
        )
        return []

    checks = [
        ("LACP", lacp_val, lacp_avg, LACP_WIGGLE_ROOM),
        ("BGP", bgp_val, bgp_avg, BGP_WIGGLE_ROOM),
    ]
    gating_failures = []
    for label, val, avg, wiggle in checks:
        # A value of None means the timing was not measured; nothing to gate.
        if val is not None and val > (avg + wiggle):
            gating_failures.append(
                f"{label} session recovery {val:.2f}s exceeded allowed threshold "
                f"(AVG + wiggle room): {avg:.2f}s + {wiggle:.2f}s "
                f"for {hwsku} {base_version}->{target_version}"
            )
            logging.info("Gating failure: %s", gating_failures[-1])
    return gating_failures

tests/common/platform/device_utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
OFFSET_ITEMS, TIME_SPAN_ITEMS, REQUIRED_PATTERNS
2121
from tests.common.devices.duthosts import DutHosts
2222
from tests.common.plugins.ansible_fixtures import ansible_adhoc # noqa: F401
23+
from tests.common.platform.controlplane_gating import controlplane_gating
2324

2425
"""
2526
Helper script for fanout switch operations
@@ -1023,6 +1024,17 @@ def post_reboot_analysis(marker, event_counters=None, reboot_oper=None, log_dir=
10231024
report_file_name = request.node.name + "_report.json"
10241025
summary_file_name = request.node.name + "_summary.json"
10251026

1027+
# Prepare minimal dict for control plane gating logic
1028+
gating_input = {
1029+
"lacp_session_max_wait": result_summary.get("controlplane", {}).get("lacp_session_max_wait"),
1030+
"bgp": result_summary.get("time_span", {}).get("bgp"),
1031+
"HwSku": result_summary.get("hwsku"),
1032+
"BaseImage": result_summary.get("base_ver"),
1033+
"TargetImage": result_summary.get("target_ver")
1034+
}
1035+
# Run control plane gating
1036+
gating_failures = controlplane_gating(gating_input)
1037+
10261038
report_file_dir = os.path.realpath((os.path.join(os.path.dirname(__file__),
10271039
"../../logs/platform_tests/")))
10281040
report_file_path = report_file_dir + "/" + report_file_name
@@ -1036,6 +1048,9 @@ def post_reboot_analysis(marker, event_counters=None, reboot_oper=None, log_dir=
10361048

10371049
# After generating timing data report, do some checks on the timing data
10381050
verification_errors = list()
1051+
# Append the gating failures
1052+
if gating_failures:
1053+
verification_errors.extend(gating_failures)
10391054
verify_mac_jumping(test_name, analyze_result, verification_errors)
10401055
if duthost.facts['platform'] != 'x86_64-kvm_x86_64-r0':
10411056
# TBD: expand this verification to KVM - extra port events in KVM which need to be filtered
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"Example-HwSKU-Model-A": {
3+
"202405": {
4+
"202411": {
5+
"LACP": {"AVG": 210.0, "P95": 225.0, "MAX": 240.0},
6+
"BGP": {"AVG": 205.0, "P95": 220.0, "MAX": 235.0}
7+
}
8+
},
9+
"202411": {
10+
"202505": {
11+
"LACP": {"AVG": 215.0, "P95": 230.0, "MAX": 245.0},
12+
"BGP": {"AVG": 208.0, "P95": 222.0, "MAX": 238.0}
13+
}
14+
}
15+
},
16+
"Example-HwSKU-Model-B": {
17+
"202405": {
18+
"202411": {
19+
"LACP": {"AVG": 200.0, "P95": 218.0, "MAX": 232.0},
20+
"BGP": {"AVG": 203.0, "P95": 215.0, "MAX": 228.0}
21+
}
22+
}
23+
}
24+
}

0 commit comments

Comments
 (0)