Skip to content

Commit 1f6613f

Browse files
authored
Checker to check if the lag-ids are in sync with chassisdb. (#4082)
What I did: This PR introduces a new monitoring script that checks whether LAG (Link Aggregation Group) IDs are synchronized between the chassis database and the ASIC databases on VOQ chassis line cards. The script is designed to be run by Monit and will alert via syslog when mismatches are detected. How I did it (key changes): (1) added the chassis_lag_id_checker script (installed as scripts/chassis_db_consistency_checker.py), which retrieves and compares LAG IDs from chassis_db and asic_db and reports mismatches per ASIC namespace; (2) added a comprehensive test suite with fixtures for mocking Redis dumps and ASIC/device configurations; (3) integrated the script into setup.py for proper installation. How to verify it: test on a VOQ chassis and run the unit tests (UT). Signed-off-by: Arvindsrinivasan Lakshmi Narasimhan <[email protected]>
1 parent 2920679 commit 1f6613f

3 files changed

Lines changed: 514 additions & 1 deletion

File tree

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
#!/usr/bin/env python3
2+
3+
"""
chassis_db_consistency_checker

This script checks for synchronization of LAG (Link Aggregation Group) IDs
between the chassis_db and asic_db on a VOQ chassis line card.
This script is intended to be run by Monit.
It will write an alerting message into syslog if it finds any mismatches in
LAG IDs between the chassis_db and asic_db.

It performs the following steps:
- Retrieves LAG IDs from the ASIC DBs (per namespace).
- Retrieves the SYSTEM_LAG_ID_TABLE from the chassis DB.
- Compares the LAG IDs in the chassis DB and ASIC DBs to identify mismatches.
- Reports any mismatched LAG keys per ASIC namespace.
- Exits with a non-zero status if mismatches are found.

Intended to be run on line cards (not on the supervisor) of a VOQ chassis
device.

Usage:
    python3 chassis_db_consistency_checker [--log-level LEVEL]

Arguments:
    --log-level LEVEL   Set the logging level (DEBUG, INFO, WARNING, ERROR,
                        CRITICAL). Default is WARNING.

"""
29+
30+
import subprocess
31+
import json
32+
import logging
33+
import argparse
34+
import sonic_py_common.multi_asic as multi_asic
35+
import sonic_py_common.device_info as device_info
36+
# Status codes returned by the functions in this script.
RC_OK = 0
RC_ERR = -1
# NOTE(review): RC_REDIS_ERR is not referenced by any code visible in this
# file; presumably reserved for redis-dump failures -- confirm before removing.
RC_REDIS_ERR = -2
39+
40+
41+
def run_redis_dump(cmd_args):
    """Run redis-dump with given command arguments and return parsed JSON output.

    Args:
        cmd_args: Argument list handed to ``subprocess.run`` (no shell is
            involved).

    Returns:
        The JSON document printed on stdout, decoded with ``json.loads``.
        An empty dict is returned when the command cannot be started, exits
        with a non-zero status, or prints something that is not valid JSON.
        Failures are logged and never propagated to the caller.
    """
    try:
        result = subprocess.run(cmd_args, capture_output=True, text=True)
        logging.debug(f"Command: {cmd_args} output: {result.stdout}")
        if result.returncode != 0:
            # Report the failure once and bail out directly instead of
            # raising an exception only to catch and log it again below.
            logging.error(f"Command failed: {result.stderr}")
            return {}
        return json.loads(result.stdout)
    except Exception as e:
        # Best-effort by design: callers treat an empty dict as "no data".
        logging.error(f"Error running redis-dump: {e}")
        return {}
53+
54+
55+
def extract_lag_ids_from_asic_db(db_output, key_pattern, lag_id_field):
    """Extract LAG IDs from redis-dump output based on key pattern and field name.

    Args:
        db_output: Mapping of redis key -> entry dict; the attributes of an
            entry are read from its ``'value'`` sub-dict.
        key_pattern: Substring a key must contain to be considered.
        lag_id_field: Name of the attribute inside ``'value'`` that holds
            the LAG ID.

    Returns:
        The set of LAG IDs found. Matching entries that do not carry
        ``lag_id_field`` are logged as errors and left out, so the result
        never contains ``None``.
    """
    lag_ids = set()
    for key, info in db_output.items():
        if key_pattern not in key:
            continue
        lag_id = info.get('value', {}).get(lag_id_field)
        if lag_id is None:
            logging.error(f"{key} has bad lag_id")
            # Do not record None as if it were an ID: it would pollute the
            # comparison and cannot be ordered against string IDs later.
            continue
        lag_ids.add(lag_id)
    logging.debug(f"Extracted LAG IDs from ASIC DB: {lag_ids}")
    return lag_ids
66+
67+
68+
def extract_table_ids_from_chassis_db(table_output):
    """Extract IDs from a table output (dict of key: id).

    Args:
        table_output: Mapping whose values are the IDs of interest.

    Returns:
        A set containing every value of ``table_output`` (duplicates
        collapse; an empty mapping yields an empty set).
    """
    return set(table_output.values())
71+
72+
73+
def get_lag_ids_asic_namespace(asic_netns):
    """Get LAG IDs from a specific ASIC namespace.

    Args:
        asic_netns: Namespace name. When it equals
            ``multi_asic.DEFAULT_NAMESPACE`` redis-dump is run directly;
            otherwise it is run through ``sudo ip netns exec <asic_netns>``.

    Returns:
        The set of ``SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID`` values found on
        the ``SAI_OBJECT_TYPE_LAG`` entries of redis database 1.
    """
    # The redis-dump invocation is identical for every namespace; only the
    # netns wrapper in front of it differs.
    asic_cmd = ["redis-dump", "-d", "1", "-k", "*SAI_OBJECT_TYPE_LAG:*", "-y"]
    if asic_netns != multi_asic.DEFAULT_NAMESPACE:
        asic_cmd = ["sudo", "ip", "netns", "exec", asic_netns] + asic_cmd

    asic_db_output = run_redis_dump(asic_cmd)
    lag_id_ns = extract_lag_ids_from_asic_db(
        asic_db_output, "SAI_OBJECT_TYPE_LAG", "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID"
    )
    logging.debug(f"LAG IDs in ASIC namespace {asic_netns}: {lag_id_ns}")
    return lag_id_ns
88+
89+
90+
def get_chassis_lag_db_table():
    """Fetch and return the SYSTEM_LAG_ID_TABLE from chassis_db.

    The table is read with redis-dump from host ``redis_chassis.server``,
    port 6380, database 12.

    Returns:
        The ``'value'`` mapping of the ``SYSTEM_LAG_ID_TABLE`` key, or an
        empty dict (after logging an error) when the key is missing or empty.
    """
    chassis_db_cmd = [
        "redis-dump",
        "-H", "redis_chassis.server",
        "-p", "6380",
        "-d", "12",
        "-k", "SYSTEM_LAG_ID_TABLE",
        "-y",
    ]
    chassis_db_raw = run_redis_dump(chassis_db_cmd)
    chassis_db_table = chassis_db_raw.get('SYSTEM_LAG_ID_TABLE', {}).get('value', {})
    if chassis_db_table:
        return chassis_db_table

    logging.error("No SYSTEM_LAG_ID_TABLE found in chassis_db")
    return {}
106+
107+
108+
def compare_lag_ids(lag_ids_in_chassis_db, asic):
    """Return the LAG IDs that differ between chassis_db and one ASIC DB.

    Args:
        lag_ids_in_chassis_db: Set of LAG IDs read from chassis_db.
        asic: ASIC namespace whose ASIC DB is queried.

    Returns:
        The set of IDs present on exactly one side. IDs missing from the
        ASIC DB and IDs missing from chassis_db are reported together, so a
        two-sided mismatch is visible in a single run. An empty set means
        both sides agree.
    """
    lag_ids_in_asic_db = get_lag_ids_asic_namespace(asic)
    return lag_ids_in_chassis_db ^ lag_ids_in_asic_db
114+
115+
116+
def check_lag_id_sync():
    """Check if LAG IDs in chassis_db and asic_db are in sync.

    Returns:
        A ``(rc, diff_summary)`` tuple. ``rc`` is ``RC_ERR`` with an empty
        summary when the chassis_db table could not be read, ``RC_OK``
        otherwise. ``diff_summary`` maps each ASIC name (``"localhost"`` for
        the default namespace) to a sorted list of mismatched LAG IDs; an
        empty list means that ASIC is in sync.
    """
    diff_summary = {}
    chassis_db_lag_table = get_chassis_lag_db_table()
    if not chassis_db_lag_table:
        return RC_ERR, diff_summary

    lag_ids_in_chassis_db = extract_table_ids_from_chassis_db(chassis_db_lag_table)
    logging.debug(f"LAG IDs in chassis_db: {lag_ids_in_chassis_db}")

    for asic_namespace in multi_asic.get_namespace_list():
        diff = compare_lag_ids(lag_ids_in_chassis_db, asic_namespace)
        asic_name = "localhost" if asic_namespace == multi_asic.DEFAULT_NAMESPACE else asic_namespace
        # Lists are JSON/logging friendly. The key keeps the natural order
        # of the IDs but places a None entry (LAG without an ID) last
        # instead of letting the str/None comparison raise TypeError.
        diff_summary[asic_name] = sorted(
            diff,
            key=lambda lag_id: (lag_id is None, "" if lag_id is None else lag_id),
        )

    return RC_OK, diff_summary
136+
137+
138+
def main():
    """Parse the command line, run the LAG ID check and return a status code.

    Returns:
        ``RC_OK`` when the device is not a VOQ chassis, is a supervisor, or
        every ASIC matches chassis_db; ``RC_ERR`` when at least one ASIC has
        mismatched LAG IDs; otherwise the non-OK code reported by
        ``check_lag_id_sync()``.
    """
    parser = argparse.ArgumentParser(description="Check LAG ID sync between chassis_db and asic_db")
    parser.add_argument('--log-level', default='WARNING', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help='Set the logging level')
    args = parser.parse_args()
    logging.basicConfig(level=getattr(logging, args.log_level))

    # The check only applies to line cards of a VOQ chassis.
    if not device_info.is_voq_chassis():
        logging.info("Not a voq chassis device. Exiting.....")
        return RC_OK

    if device_info.is_supervisor():
        logging.info("Not supported on supervisor. Exiting....")
        return RC_OK

    rc, diff_summary = check_lag_id_sync()
    if rc != RC_OK:
        return rc

    mismatches_found = False
    for asic, mismatches in diff_summary.items():
        if mismatches:
            logging.critical(f"Mismatched LAG keys in {asic}: {mismatches}")
            mismatches_found = True

    if not mismatches_found:
        logging.info("All ASICs are in sync with chassis_db")
        return RC_OK

    logging.critical("Summary of mismatches:\n%s", json.dumps(diff_summary, indent=4))
    return RC_ERR
169+
170+
171+
if __name__ == "__main__":
    # Hand main()'s status code to the caller as the process exit status;
    # without this the script always exits 0, even when main() returns an
    # error code.
    import sys

    sys.exit(main())

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,8 @@
196196
'scripts/check_db_integrity.py',
197197
'scripts/sysreadyshow',
198198
'scripts/wredstat',
199-
'scripts/sonic-error-report'
199+
'scripts/sonic-error-report',
200+
'scripts/chassis_db_consistency_checker.py'
200201
],
201202
entry_points={
202203
'console_scripts': [

0 commit comments

Comments
 (0)