diff --git a/scripts/chassis_db_consistency_checker.py b/scripts/chassis_db_consistency_checker.py new file mode 100644 index 0000000000..038a91aeb1 --- /dev/null +++ b/scripts/chassis_db_consistency_checker.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 + +""" +chassis_db_consistency_checker + +This script checks for synchronization of LAG (Link Aggregation Group) IDs +between the chassis_db and asic_db on VOQ chassis Linecard. +This script is intended to be run by Monit. +It will write an alerting message into syslog if it finds any mismatches in +LAG IDs between the chassis_db and asic_db. + +It performs the following steps: +- Retrieves LAG IDs from the ASIC DBs (per namespace). +- Retrieves the SYSTEM_LAG_ID_TABLE from the chassis DB. +- Compares the LAG IDs in the chassis DB and ASIC DBs to identify mismatches. +- Reports any mismatched LAG keys per ASIC namespace. +- Exits with a non-zero status if mismatches are found. + +Intended to be run on line cards (not on the supervisor) of a VOQ chassis +device. +Usage: + python3 chassis_db_consistency_checker [--log-level LEVEL] + +Arguments: + --log-level LEVEL Set the logging level (DEBUG, INFO, WARNING, ERROR, + CRITICAL). Default is WARNING. + +""" + +import subprocess +import json +import logging +import argparse +import sonic_py_common.multi_asic as multi_asic +import sonic_py_common.device_info as device_info +RC_OK = 0 +RC_ERR = -1 +RC_REDIS_ERR = -2 + + +def run_redis_dump(cmd_args): + """Run redis-dump with given command arguments and return parsed JSON output.""" + try: + result = subprocess.run(cmd_args, capture_output=True, text=True) + logging.debug(f"Command: {cmd_args} output: {result.stdout}") + if result.returncode != 0: + logging.error(f"Command failed: {result.stderr}") + raise RuntimeError(f"Command failed: {result.stderr}") + return json.loads(result.stdout) + except Exception as e: + logging.error(f"Error running redis-dump: {e}") + return {} + + +def extract_lag_ids_from_asic_db(db_output, key_pattern, lag_id_field): + """Extract LAG IDs from redis-dump output based on key pattern and field name.""" + lag_ids = set() + for key, info in db_output.items(): + if key_pattern in key: + lag_id = info.get('value', {}).get(lag_id_field, None) + if lag_id is None: + logging.error(f"{key} has bad lag_id") + lag_ids.add(lag_id) + logging.debug(f"Extracted LAG IDs from ASIC DB: {lag_ids}") + return lag_ids + + +def extract_table_ids_from_chassis_db(table_output): + """Extract IDs from a table output (dict of key: id).""" + return set(table_output.values()) + + +def get_lag_ids_asic_namespace(asic_netns): + """Get LAG IDs from a specific ASIC namespace.""" + if asic_netns == multi_asic.DEFAULT_NAMESPACE: + asic_cmd = ["redis-dump", "-d", "1", "-k", "*SAI_OBJECT_TYPE_LAG:*", "-y"] + else: + asic_cmd = [ + "sudo", "ip", "netns", "exec", asic_netns, + "redis-dump", "-d", "1", "-k", "*SAI_OBJECT_TYPE_LAG:*", "-y" + ] + asic_db_output = run_redis_dump(asic_cmd) + lag_id_ns = extract_lag_ids_from_asic_db( + asic_db_output, "SAI_OBJECT_TYPE_LAG", "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID" + ) + logging.debug(f"LAG IDs in ASIC namespace {asic_netns}: {lag_id_ns}") + return lag_id_ns + + +def get_chassis_lag_db_table(): + """Fetch and return the SYSTEM_LAG_ID_TABLE from chassis_db.""" + chassis_db_cmd = [ + "redis-dump", + "-H", "redis_chassis.server", + "-p", "6380", + "-d", "12", + "-k", "SYSTEM_LAG_ID_TABLE", + "-y" + ] + chassis_db_raw = run_redis_dump(chassis_db_cmd) + chassis_db_table = chassis_db_raw.get('SYSTEM_LAG_ID_TABLE', {}).get('value', {}) + if not chassis_db_table: + logging.error("No SYSTEM_LAG_ID_TABLE found in chassis_db") + return {} + return chassis_db_table + + +def compare_lag_ids(lag_ids_in_chassis_db, asic): + lag_ids_in_asic_db = get_lag_ids_asic_namespace(asic) + diff = lag_ids_in_chassis_db - lag_ids_in_asic_db + if not diff: + diff = lag_ids_in_asic_db - lag_ids_in_chassis_db + return diff + + +def check_lag_id_sync(): + """Check if LAG IDs in chassis_db and asic_db are in sync.""" + + rc = RC_OK + diff_summary = {} + chassis_db_lag_table = get_chassis_lag_db_table() + if not chassis_db_lag_table: + return RC_ERR, diff_summary + lag_ids_in_chassis_db = extract_table_ids_from_chassis_db(chassis_db_lag_table) + logging.debug(f"LAG IDs in chassis_db: {lag_ids_in_chassis_db}") + + asic_namespaces = multi_asic.get_namespace_list() + + for asic_namespace in asic_namespaces: + diff = compare_lag_ids(lag_ids_in_chassis_db, asic_namespace) + asic_name = "localhost" if asic_namespace == multi_asic.DEFAULT_NAMESPACE else asic_namespace + # Convert set to list for JSON/logging friendliness + diff_summary[asic_name] = sorted(list(diff)) + + return rc, diff_summary + + +def main(): + parser = argparse.ArgumentParser(description="Check LAG ID sync between chassis_db and asic_db") + parser.add_argument('--log-level', default='WARNING', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + help='Set the logging level') + args = parser.parse_args() + logging.basicConfig(level=getattr(logging, args.log_level)) + + if not device_info.is_voq_chassis(): + logging.info("Not a voq chassis device. Exiting.....") + return RC_OK + + if device_info.is_supervisor(): + logging.info("Not supported on supervisor. Exiting....") + return RC_OK + + rc, diff_summary = check_lag_id_sync() + if rc != RC_OK: + return rc + + mismatches_found = False + for asic, mismatches in diff_summary.items(): + if mismatches: + logging.critical(f"Mismatched LAG keys in {asic}: {mismatches}") + mismatches_found = True + + if mismatches_found: + logging.critical("Summary of mismatches:\n%s", json.dumps(diff_summary, indent=4)) + return RC_ERR + else: + logging.info("All ASICs are in sync with chassis_db") + return RC_OK + + +if __name__ == "__main__": + main() diff --git a/setup.py b/setup.py index d4ec5035c9..28e3f61312 100644 --- a/setup.py +++ b/setup.py @@ -195,7 +195,8 @@ 'scripts/check_db_integrity.py', 'scripts/sysreadyshow', 'scripts/wredstat', - 'scripts/sonic-error-report' + 'scripts/sonic-error-report', + 'scripts/chassis_db_consistency_checker.py' ], entry_points={ 'console_scripts': [ diff --git a/tests/chassis_db_consistency_checker_test.py b/tests/chassis_db_consistency_checker_test.py new file mode 100644 index 0000000000..e53e5c7a15 --- /dev/null +++ b/tests/chassis_db_consistency_checker_test.py @@ -0,0 +1,340 @@ + +import pytest +import sys +import logging +import json +import sonic_py_common.multi_asic as multi_asic +import sonic_py_common.device_info as device_info +sys.path.append("scripts") # noqa: E402 +import chassis_db_consistency_checker # noqa: E402 + +MULTI_ASIC_MISMATCH_LOGS = """CRITICAL root:chassis_db_consistency_checker.py:160 Mismatched LAG keys in asic0: ['264', '265', '266'] +CRITICAL root:chassis_db_consistency_checker.py:160 Mismatched LAG keys in asic1: ['264', '265', '266'] +CRITICAL root:chassis_db_consistency_checker.py:164 Summary of mismatches: +{ + "asic0": [ + "264", + "265", + "266" + ], + "asic1": [ + "264", + "265", + "266" + ] +} +""" +SINGLE_ASIC_MISMATCH_LOGS = """ +CRITICAL root:chassis_db_consistency_checker.py:160 Mismatched LAG keys in localhost: ['264', '265', '266'] +CRITICAL root:chassis_db_consistency_checker.py:164 Summary of mismatches: +{ + "localhost": [ + "264", + "265", + "266" + ] +}""" + + +@pytest.fixture +def mock_run_redis_dump(monkeypatch): + def _mock(cmd_args): + # Return a fake redis-dump output based on command args + if "SAI_OBJECT_TYPE_LAG" in str(cmd_args): + # Simulate ASIC DB output + return { + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b27": { + "expireat": 1764524951.6364665, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "262" + } + }, + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b28": { + "expireat": 1764524951.6364777, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "263" + } + }, + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b29": { + "expireat": 1764524951.636488, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "264" + } + }, + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b2a": { + "expireat": 1764524951.6364946, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "265" + } + }, + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b2b": { + "expireat": 1764524951.636469, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "266" + } + } + } + elif "SYSTEM_LAG_ID_TABLE" in str(cmd_args): + # Simulate Chassis DB output + return { + "SYSTEM_LAG_ID_TABLE": { + "expireat": 1764524950.7635868, + "ttl": -0.001, + "type": "hash", + "value": { + "sonic-lc1-1|asic0|PortChannel112": "262", + "sonic-lc1-1|asic0|PortChannel116": "263", + "sonic-lc2-1|asic0|PortChannel100": "264", + "sonic-lc3-1|asic0|PortChannel149": "265", + "sonic-lc3-1|asic0|PortChannel150": "266", + } + } + } + return {} + monkeypatch.setattr(chassis_db_consistency_checker, "run_redis_dump", _mock) + + +@pytest.fixture +def run_redis_dump_empty(monkeypatch): + def _mock(cmd_args): + return {} + monkeypatch.setattr(chassis_db_consistency_checker, "run_redis_dump", _mock) + + +@pytest.fixture +def mock_run_redis_dump_mismatch(monkeypatch): + def _mock(cmd_args): + # Return a fake redis-dump output based on command args + if "SAI_OBJECT_TYPE_LAG" in str(cmd_args): + # Simulate ASIC DB output + return { + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b27": { + "expireat": 1764524951.6364665, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "262" + } + }, + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b28": { + "expireat": 1764524951.6364777, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "263" + } + }, + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b29": { + "expireat": 1764524951.636488, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "264" + } + }, + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b2a": { + "expireat": 1764524951.6364946, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "265" + } + }, + "ASIC_STATE:SAI_OBJECT_TYPE_LAG:oid:0x102000000000b2b": { + "expireat": 1764524951.636469, + "ttl": -0.001, + "type": "hash", + "value": { + "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "266" + } + } + } + elif "SYSTEM_LAG_ID_TABLE" in str(cmd_args): + # Simulate Chassis DB output + return { + "SYSTEM_LAG_ID_TABLE": { + "expireat": 1764524950.7635868, + "ttl": -0.001, + "type": "hash", + "value": { + "sonic-lc1-1|asic0|PortChannel112": "262", + "sonic-lc1-1|asic0|PortChannel116": "263" + } + } + } + return {} + monkeypatch.setattr(chassis_db_consistency_checker, "run_redis_dump", _mock) + + +@pytest.fixture +def mock_multi_asic(monkeypatch): + monkeypatch.setattr(multi_asic, "get_namespace_list", lambda: ["asic0", "asic1"]) + + +@pytest.fixture +def mock_single_asic(monkeypatch): + monkeypatch.setattr(multi_asic, "get_namespace_list", lambda: [multi_asic.DEFAULT_NAMESPACE]) + + +@pytest.fixture +def mock_device_info(monkeypatch): + monkeypatch.setattr(device_info, "is_voq_chassis", lambda: True) + monkeypatch.setattr(device_info, "is_supervisor", lambda: False) + + +@pytest.fixture +def mock_device_info_no_voq(monkeypatch): + monkeypatch.setattr(device_info, "is_voq_chassis", lambda: False) + + +@pytest.fixture +def mock_device_info_supervisor(monkeypatch): + monkeypatch.setattr(device_info, "is_voq_chassis", lambda: True) + monkeypatch.setattr(device_info, "is_supervisor", lambda: True) + + +def test_extract_lag_ids_from_asic_db(): + db_output = { + "SAI_OBJECT_TYPE_LAG:1": {"value": {"SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "100"}}, + "SAI_OBJECT_TYPE_LAG:2": {"value": {"SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "200"}}, + "OTHER_KEY": {"value": {"SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "999"}} + } + lag_ids = chassis_db_consistency_checker.extract_lag_ids_from_asic_db( + db_output, "SAI_OBJECT_TYPE_LAG", "SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID" + ) + assert "100" in lag_ids + assert "200" in lag_ids + + +def test_extract_table_ids_from_chassis_db(): + table_output = {"PortChannel1": "100", "hostname|asic0|PortChannel2": "200"} + ids = chassis_db_consistency_checker.extract_table_ids_from_chassis_db(table_output) + assert ids == {"100", "200"} + + +def test_compare_lag_ids(mock_run_redis_dump, mock_multi_asic): + lag_ids_in_chassis_db = {"262", "264"} + diff = chassis_db_consistency_checker.compare_lag_ids(lag_ids_in_chassis_db, "asic0") + assert diff == {'263', '265', '266'} + + +def test_check_lag_id_sync(mock_run_redis_dump, mock_multi_asic): + rc, diff_summary = chassis_db_consistency_checker.check_lag_id_sync() + assert rc == 0 + assert {'asic0': [], 'asic1': []} == diff_summary + + +def test_check_no_voq_chassis(monkeypatch, mock_run_redis_dump, mock_device_info_no_voq, caplog): + caplog.set_level(logging.INFO) + monkeypatch.setattr(sys, "argv", ["chassis_db_consistency_checker.py"]) + rc = chassis_db_consistency_checker.main() + assert rc == 0 + expected_msg = "INFO root:chassis_db_consistency_checker.py:146 Not a voq chassis device. Exiting....." + assert caplog.text.strip() == expected_msg + + +def test_check_no_supervisor(monkeypatch, mock_run_redis_dump, mock_device_info_supervisor, caplog): + caplog.set_level(logging.INFO) + monkeypatch.setattr(sys, "argv", ["chassis_db_consistency_checker.py"]) + rc = chassis_db_consistency_checker.main() + assert rc == 0 + expected_msg = "INFO root:chassis_db_consistency_checker.py:150 Not supported on supervisor. Exiting...." + assert caplog.text.strip() == expected_msg + + +def test_no_mismatch(monkeypatch, mock_run_redis_dump, mock_multi_asic, mock_device_info): + # Ensure main sees predictable args + monkeypatch.setattr(sys, "argv", ["chassis_db_consistency_checker.py"]) + rc = chassis_db_consistency_checker.main() + assert rc == 0 + + +def test_no_mismatch_single_asic(monkeypatch, mock_run_redis_dump, mock_single_asic, mock_device_info): + # Ensure main sees predictable args + monkeypatch.setattr(sys, "argv", ["chassis_db_consistency_checker.py"]) + rc = chassis_db_consistency_checker.main() + assert rc == 0 + + +def test_with_mismatch(monkeypatch, mock_run_redis_dump_mismatch, mock_multi_asic, mock_device_info, caplog): + caplog.set_level(logging.CRITICAL) + monkeypatch.setattr(sys, "argv", ["chassis_db_consistency_checker.py"]) + rc = chassis_db_consistency_checker.main() + assert caplog.text.strip() == MULTI_ASIC_MISMATCH_LOGS.strip() + assert rc == -1 + + +def test_with_mismatch_single_asic(monkeypatch, mock_run_redis_dump_mismatch, + mock_single_asic, mock_device_info, caplog): + caplog.set_level(logging.CRITICAL) + monkeypatch.setattr(sys, "argv", ["chassis_db_consistency_checker.py"]) + rc = chassis_db_consistency_checker.main() + # The following line is within 120 columns + assert caplog.text.strip() == SINGLE_ASIC_MISMATCH_LOGS.strip() + assert rc == -1 + + +def test_redis_dump_no_output(monkeypatch, run_redis_dump_empty, + mock_multi_asic, mock_device_info, caplog): + caplog.set_level(logging.ERROR) + monkeypatch.setattr(sys, "argv", ["chassis_db_consistency_checker.py"]) + rc = chassis_db_consistency_checker.main() + assert rc == -1 + expected_msg = ( + "ERROR root:chassis_db_consistency_checker.py:103 " + "No SYSTEM_LAG_ID_TABLE found in chassis_db" + ) + assert caplog.text.strip() == expected_msg + + +def test_run_redis_dump_failure(monkeypatch): + class FakeCompleted: + def __init__(self, stdout="", stderr="", returncode=1): + self.stdout = stdout + self.stderr = stderr + self.returncode = returncode + + def fake_run(cmd_args, capture_output=True, text=True): + return FakeCompleted(stdout="", stderr="error", returncode=1) + + monkeypatch.setattr(chassis_db_consistency_checker.subprocess, "run", fake_run) + out = chassis_db_consistency_checker.run_redis_dump(["redis-dump", "-d", "1", "-y"]) + assert out == {} # script logs and returns {} + + +def test_run_redis_dump(monkeypatch, caplog): + # Create a fake CompletedProcess-like object + class FakeCompleted: + def __init__(self, stdout="", stderr="", returncode=0): + self.stdout = stdout + self.stderr = stderr + self.returncode = returncode + + # Define behavior based on args + def fake_run(cmd_args, capture_output=True, text=True): + assert capture_output and text + if "-k" in cmd_args and "*SAI_OBJECT_TYPE_LAG:*" in cmd_args: + payload = {"SAI_OBJECT_TYPE_LAG:oid:1": { + "value": {"SAI_LAG_ATTR_SYSTEM_PORT_AGGREGATE_ID": "100"}}} + return FakeCompleted(stdout=json.dumps(payload)) + elif "-k" in cmd_args and "SYSTEM_LAG_ID_TABLE" in cmd_args: + payload = {"SYSTEM_LAG_ID_TABLE": {"value": {"hostname|asic0|PortChannel1": "100"}}} + return FakeCompleted(stdout=json.dumps(payload)) + return FakeCompleted(stdout="{}", returncode=0) + + # Patch subprocess.run + monkeypatch.setattr(chassis_db_consistency_checker.subprocess, "run", fake_run) + + # Now call script functions; they will use the mocked run + table = chassis_db_consistency_checker.get_chassis_lag_db_table() + assert table == {"hostname|asic0|PortChannel1": "100"}