From 182897cd36a87b9ca9d0c975827b3675f15ede5e Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Fri, 3 Nov 2023 14:44:44 +0200 Subject: [PATCH 1/9] ASIC/SDK health event Signed-off-by: Stephen Sun --- clear/main.py | 8 ++ config/main.py | 78 ++++++++++ show/main.py | 46 ++++++ .../config_db.json | 8 ++ .../asic_sdk_health_event_input/state_db.json | 19 +++ .../state_db_no_event.json | 9 ++ .../state_db_no_fatal.json | 13 ++ .../state_db_no_notice.json | 13 ++ .../state_db_no_warning.json | 12 ++ tests/asic_sdk_health_event_test.py | 133 ++++++++++++++++++ 10 files changed, 339 insertions(+) create mode 100644 tests/asic_sdk_health_event_input/config_db.json create mode 100644 tests/asic_sdk_health_event_input/state_db.json create mode 100644 tests/asic_sdk_health_event_input/state_db_no_event.json create mode 100644 tests/asic_sdk_health_event_input/state_db_no_fatal.json create mode 100644 tests/asic_sdk_health_event_input/state_db_no_notice.json create mode 100644 tests/asic_sdk_health_event_input/state_db_no_warning.json create mode 100644 tests/asic_sdk_health_event_test.py diff --git a/clear/main.py b/clear/main.py index d09153533b..e5e9fd8d9c 100755 --- a/clear/main.py +++ b/clear/main.py @@ -550,6 +550,14 @@ def route(prefix, vrf, namespace): helper = util_base.UtilHelper() helper.load_and_register_plugins(plugins, cli) +# ("sonic-clear asic-sdk-health-event") +@cli.command() +@clicommon.pass_db +def asic_sdk_health_event(db): + keys = db.db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE*") + for key in keys: + db.db.delete(db.db.STATE_DB, key); + if __name__ == '__main__': cli() diff --git a/config/main.py b/config/main.py index 65b669be7f..15c0fe79eb 100644 --- a/config/main.py +++ b/config/main.py @@ -7374,5 +7374,83 @@ def date(date, time): clicommon.run_command(['timedatectl', 'set-time', date_time]) +# +# 'asic-sdk-health-event' group ('config asic-sdk-health-event ...') +# +@config.group() +@click.pass_context +def asic_sdk_health_event(ctx): + 
"""Configuring asic-sdk-health-event""" + pass + + +@asic_sdk_health_event.group() +@clicommon.pass_db +def suppress(db): + """Suppress ASIC/SDK health event""" + pass + + +def handle_asic_sdk_health_suppress_category_list(db, severity, category_list): + ctx = click.get_current_context() + + state_db = SonicV2Connector(host='127.0.0.1') + state_db.connect(state_db.STATE_DB, False) + entry_name="SWITCH_CAPABILITY|switch" + if "true" != state_db.get(state_db.STATE_DB, entry_name, "ASIC_SDK_HEALTH_EVENT"): + ctx.fail("ASIC/SDK health event is not supported on the platform") + + severityCapabilities = { + "fatal": "REG_FATAL_ASIC_SDK_HEALTH_CATEGORY", + "warning": "REG_WARNING_ASIC_SDK_HEALTH_CATEGORY", + "notice": "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY" + } + if "true" != state_db.get(state_db.STATE_DB, entry_name, severityCapabilities[severity]): + ctx.fail("Suppressing ASIC/SDK health {} event is not supported on the platform".format(severity)) + + categories = {"software", "firmware", "cpu_hw", "asic_hw"} + + if category_list == 'none': + suppressedCategoriesList = [] + elif category_list == 'all': + suppressedCategoriesList = list(categories) + else: + suppressedCategoriesList = category_list.split(',') + + unsupportCategories = set(suppressedCategoriesList) - categories + if unsupportCategories: + ctx.fail("Invalid category(ies): {}".format(unsupportCategories)) + + cfgdb_clients = db.cfgdb_clients + + for ns, config_db in cfgdb_clients.items(): + + if suppressedCategoriesList: + config_db.mod_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, {"categories": suppressedCategoriesList}) + else: + config_db.mod_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, None) + + +@suppress.command() +@click.argument('category-list', required=True) +@clicommon.pass_db +def fatal(db, category_list): + handle_asic_sdk_health_suppress_category_list(db, 'fatal', category_list) + + +@suppress.command() +@click.argument('category-list', required=True) +@clicommon.pass_db +def 
warning(db, category_list): + handle_asic_sdk_health_suppress_category_list(db, 'warning', category_list) + + +@suppress.command() +@click.argument('category-list', required=True) +@clicommon.pass_db +def notice(db, category_list): + handle_asic_sdk_health_suppress_category_list(db, 'notice', category_list) + + if __name__ == '__main__': config() diff --git a/show/main.py b/show/main.py index 725556e6e8..2a6ee07ad7 100755 --- a/show/main.py +++ b/show/main.py @@ -2139,6 +2139,52 @@ def suppress_pending_fib(db): click.echo(state) +# asic-sdk-health-event subcommand ("show asic-sdk-health-event") +@cli.group(cls=clicommon.AliasedGroup) +def asic_sdk_health_event(): + """""" + pass + + +@asic_sdk_health_event.command() +@clicommon.pass_db +def suppressed_category_list(db): + """""" + if "true" != db.db.get(db.db.STATE_DB, "SWITCH_CAPABILITY|switch", "ASIC_SDK_HEALTH_EVENT"): + ctx = click.get_current_context() + ctx.fail("ASIC/SDK health event is not supported on the platform") + + suppressSeverities = db.cfgdb.get_table('SUPPRESS_ASIC_SDK_HEALTH_EVENT') + header = ['Severity', 'Suppressed category-list'] + body = [] + + for severity in natsorted(suppressSeverities): + body.append([severity, ','.join(suppressSeverities[severity]['categories'])]) + + click.echo(tabulate(body, header)) + + +@asic_sdk_health_event.command() +@clicommon.pass_db +def received(db): + """""" + if "true" != db.db.get(db.db.STATE_DB, "SWITCH_CAPABILITY|switch", "ASIC_SDK_HEALTH_EVENT"): + ctx = click.get_current_context() + ctx.fail("ASIC/SDK health event is not supported on the platform") + + event_keys = db.db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE|*") + delimiter = db.db.get_db_separator(db.db.STATE_DB) + + header = ['Date', 'ASICID', 'Severity', 'Category', 'Description'] + body = [] + + for key in natsorted(event_keys): + event = db.db.get_all(db.db.STATE_DB, key) + body.append([key.split('|')[1], event.get('asic_id'), event.get('severity'), event.get('category'), 
event.get('description')]) + + click.echo(tabulate(body, header)) + + # Load plugins and register them helper = util_base.UtilHelper() helper.load_and_register_plugins(plugins, cli) diff --git a/tests/asic_sdk_health_event_input/config_db.json b/tests/asic_sdk_health_event_input/config_db.json new file mode 100644 index 0000000000..fb929701be --- /dev/null +++ b/tests/asic_sdk_health_event_input/config_db.json @@ -0,0 +1,8 @@ +{ + "SUPPRESS_ASIC_SDK_HEALTH_EVENT|fatal": { + "categories@": "software" + }, + "SUPPRESS_ASIC_SDK_HEALTH_EVENT|warning": { + "categories@": "firmware,asic_hw" + } +} diff --git a/tests/asic_sdk_health_event_input/state_db.json b/tests/asic_sdk_health_event_input/state_db.json new file mode 100644 index 0000000000..1eb80bcd81 --- /dev/null +++ b/tests/asic_sdk_health_event_input/state_db.json @@ -0,0 +1,19 @@ +{ + "SWITCH_CAPABILITY|switch": { + "MIRROR": "true", + "MIRRORV6": "true", + "PORT_TPID_CAPABLE": "true", + "LAG_TPID_CAPABLE": "true", + "ACL_ACTION|PACKET_ACTION": "FORWARD", + "ASIC_SDK_HEALTH_EVENT": "true", + "REG_FATAL_ASIC_SDK_HEALTH_CATEGORY": "true", + "REG_WARNING_ASIC_SDK_HEALTH_CATEGORY": "true", + "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY": "true" + }, + "ASIC_SDK_HEALTH_EVENT_TABLE|2023-11-22 09:18:12": { + "asic_id": "0", + "severity": "fatal", + "description": "ASIC SDK health event", + "category": "firmware" + } +} diff --git a/tests/asic_sdk_health_event_input/state_db_no_event.json b/tests/asic_sdk_health_event_input/state_db_no_event.json new file mode 100644 index 0000000000..9425fce730 --- /dev/null +++ b/tests/asic_sdk_health_event_input/state_db_no_event.json @@ -0,0 +1,9 @@ +{ + "SWITCH_CAPABILITY|switch": { + "MIRROR": "true", + "MIRRORV6": "true", + "PORT_TPID_CAPABLE": "true", + "LAG_TPID_CAPABLE": "true", + "ACL_ACTION|PACKET_ACTION": "FORWARD" + } +} diff --git a/tests/asic_sdk_health_event_input/state_db_no_fatal.json b/tests/asic_sdk_health_event_input/state_db_no_fatal.json new file mode 100644 index 
0000000000..540f1f488c --- /dev/null +++ b/tests/asic_sdk_health_event_input/state_db_no_fatal.json @@ -0,0 +1,13 @@ +{ + "SWITCH_CAPABILITY|switch": { + "MIRROR": "true", + "MIRRORV6": "true", + "PORT_TPID_CAPABLE": "true", + "LAG_TPID_CAPABLE": "true", + "ACL_ACTION|PACKET_ACTION": "FORWARD", + "ASIC_SDK_HEALTH_EVENT": "true", + "REG_FATAL_ASIC_SDK_HEALTH_CATEGORY": "false", + "REG_WARNING_ASIC_SDK_HEALTH_CATEGORY": "true", + "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY": "true" + } +} diff --git a/tests/asic_sdk_health_event_input/state_db_no_notice.json b/tests/asic_sdk_health_event_input/state_db_no_notice.json new file mode 100644 index 0000000000..82a46e4c95 --- /dev/null +++ b/tests/asic_sdk_health_event_input/state_db_no_notice.json @@ -0,0 +1,13 @@ +{ + "SWITCH_CAPABILITY|switch": { + "MIRROR": "true", + "MIRRORV6": "true", + "PORT_TPID_CAPABLE": "true", + "LAG_TPID_CAPABLE": "true", + "ACL_ACTION|PACKET_ACTION": "FORWARD", + "ASIC_SDK_HEALTH_EVENT": "true", + "REG_FATAL_ASIC_SDK_HEALTH_CATEGORY": "true", + "REG_WARNING_ASIC_SDK_HEALTH_CATEGORY": "true", + "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY": "unknown" + } +} diff --git a/tests/asic_sdk_health_event_input/state_db_no_warning.json b/tests/asic_sdk_health_event_input/state_db_no_warning.json new file mode 100644 index 0000000000..f654d3f1a6 --- /dev/null +++ b/tests/asic_sdk_health_event_input/state_db_no_warning.json @@ -0,0 +1,12 @@ +{ + "SWITCH_CAPABILITY|switch": { + "MIRROR": "true", + "MIRRORV6": "true", + "PORT_TPID_CAPABLE": "true", + "LAG_TPID_CAPABLE": "true", + "ACL_ACTION|PACKET_ACTION": "FORWARD", + "ASIC_SDK_HEALTH_EVENT": "true", + "REG_FATAL_ASIC_SDK_HEALTH_CATEGORY": "true", + "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY": "true" + } +} diff --git a/tests/asic_sdk_health_event_test.py b/tests/asic_sdk_health_event_test.py new file mode 100644 index 0000000000..04d44ab54c --- /dev/null +++ b/tests/asic_sdk_health_event_test.py @@ -0,0 +1,133 @@ +import click +import config.main as config +import show.main 
as show +import clear.main as clear +import operator +import os +import pytest +import sys + +from click.testing import CliRunner +from .mock_tables import dbconnector +from utilities_common.db import Db + + +test_path = os.path.dirname(os.path.abspath(__file__)) +mock_db_path = os.path.join(test_path, "asic_sdk_health_event_input") +modules_path = os.path.dirname(test_path) +sys.path.insert(0, modules_path) + + +class TestAsicSdkHealthEvent(object): + @classmethod + def setup_class(cls): + print("SETUP") + + @classmethod + def teardown_class(cls): + print("TEARDOWN") + + @pytest.mark.parametrize("severity,categories", [ + ("fatal", "cpu_hw"), + ("warning", "asic_hw,software,firmware"), + ("notice", "cpu_hw,firmware") + ]) + def test_config_suppress_asic_sdk_health_event(self, severity, categories): + dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db') + + runner = CliRunner() + db = Db() + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "all"], obj=db) + assert result.exit_code == 0 + output_categories = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['categories'] + assert {'asic_hw', 'firmware', 'cpu_hw', 'software'} == set(output_categories) + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, categories], obj=db) + assert result.exit_code == 0 + output_categories = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['categories'] + assert set(categories.split(',')) == set(output_categories) + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "none"], obj=db) + assert result.exit_code == 0 + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity) + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "unknown"], obj=db) + assert result.exit_code != 0 + 
assert "Invalid category(ies): {'unknown'}" in result.output + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity) + + @pytest.mark.parametrize("severity", ["fatal", "warning", "notice"]) + def test_config_suppress_asic_sdk_health_event_unsupported_severity(self, severity): + dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db_no_' + severity) + + runner = CliRunner() + db = Db() + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "all"], obj=db) + assert result.exit_code != 0 + assert "Suppressing ASIC/SDK health {} event is not supported on the platform".format(severity) in result.output + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity) + + def test_config_suppress_asic_sdk_health_event_unsupported_event(self): + dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db_no_event') + + runner = CliRunner() + db = Db() + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + ["fatal", "all"], obj=db) + assert result.exit_code != 0 + assert "ASIC/SDK health event is not supported on the platform" in result.output + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", "fatal") + + def test_show_asic_sdk_health_event_received(self): + expected_output = \ + "Date ASICID Severity Category Description\n" + "------------------- -------- ---------- ---------- ---------------------\n" + "2023-11-22 09:18:12 0 fatal firmware ASIC SDK health event\n" + dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db') + + runner = CliRunner() + db = Db() + + result = runner.invoke(show.cli.commands["asic-sdk-health-event"].commands["received"], [], obj=db) + assert result.exit_code == 0 + assert expected_output in result.output + + def test_show_asic_sdk_health_event_suppressed_category_list(self): + expected_output = \ + 'Severity Suppressed category-list\n' 
+ '---------- --------------------------\n' + 'fatal software\n' + 'warning firmware,asic_hw\n' + dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db') + dbconnector.dedicated_dbs['CONFIG_DB'] = os.path.join(mock_db_path, 'config_db') + + runner = CliRunner() + db = Db() + + result = runner.invoke(show.cli.commands["asic-sdk-health-event"].commands["suppressed-category-list"], [], obj=db) + assert result.exit_code == 0 + assert expected_output in result.output + + def test_clear_suppress_asic_sdk_health_event(self): + dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db') + + runner = CliRunner() + db = Db() + + result = runner.invoke(clear.cli.commands["asic-sdk-health-event"], [], obj=db) + assert result.exit_code == 0 + assert not db.db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE*") From 46782c249915fc13ae73126bb6fb93892f02ff05 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Fri, 8 Dec 2023 02:48:24 +0000 Subject: [PATCH 2/9] Add asic-sdk-health-event to show techsupport dump Signed-off-by: Stephen Sun --- scripts/generate_dump | 2 ++ tests/asic_sdk_health_event_test.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/scripts/generate_dump b/scripts/generate_dump index 849462fd36..ee9d311877 100755 --- a/scripts/generate_dump +++ b/scripts/generate_dump @@ -1869,6 +1869,8 @@ main() { # 1st counter snapshot early. Need 2 snapshots to make sense of counters trend. 
save_counter_snapshot $asic 1 + save_cmd "show asic-sdk-health-event received" "asic.sdk.health.event" & + save_cmd "systemd-analyze blame" "systemd.analyze.blame" & save_cmd "systemd-analyze dump" "systemd.analyze.dump" & save_cmd "systemd-analyze plot" "systemd.analyze.plot.svg" & diff --git a/tests/asic_sdk_health_event_test.py b/tests/asic_sdk_health_event_test.py index 04d44ab54c..9dba3315f4 100644 --- a/tests/asic_sdk_health_event_test.py +++ b/tests/asic_sdk_health_event_test.py @@ -26,6 +26,8 @@ def setup_class(cls): @classmethod def teardown_class(cls): print("TEARDOWN") + dbconnector.dedicated_dbs['STATE_DB'] = None + dbconnector.dedicated_dbs['CONFIG_DB'] = None @pytest.mark.parametrize("severity,categories", [ ("fatal", "cpu_hw"), From e37ec38031ea4e8f67265289df94462d257798bf Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Fri, 8 Dec 2023 10:17:13 +0000 Subject: [PATCH 3/9] Fix multi ASIC issue Signed-off-by: Stephen Sun --- config/main.py | 48 ++++++++++----- show/main.py | 93 ++++++++++++++++++++++------- tests/asic_sdk_health_event_test.py | 14 ++--- 3 files changed, 111 insertions(+), 44 deletions(-) diff --git a/config/main.py b/config/main.py index 15c0fe79eb..1fca620532 100644 --- a/config/main.py +++ b/config/main.py @@ -7391,22 +7391,19 @@ def suppress(db): pass -def handle_asic_sdk_health_suppress_category_list(db, severity, category_list): +def handle_asic_sdk_health_suppress_category_list(db, severity, category_list, namespace): ctx = click.get_current_context() - state_db = SonicV2Connector(host='127.0.0.1') - state_db.connect(state_db.STATE_DB, False) - entry_name="SWITCH_CAPABILITY|switch" - if "true" != state_db.get(state_db.STATE_DB, entry_name, "ASIC_SDK_HEALTH_EVENT"): - ctx.fail("ASIC/SDK health event is not supported on the platform") + if multi_asic.get_num_asics() > 1: + namespace_list = multi_asic.get_namespaces_from_linux() + else: + namespace_list = [DEFAULT_NAMESPACE] severityCapabilities = { "fatal": 
"REG_FATAL_ASIC_SDK_HEALTH_CATEGORY", "warning": "REG_WARNING_ASIC_SDK_HEALTH_CATEGORY", "notice": "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY" } - if "true" != state_db.get(state_db.STATE_DB, entry_name, severityCapabilities[severity]): - ctx.fail("Suppressing ASIC/SDK health {} event is not supported on the platform".format(severity)) categories = {"software", "firmware", "cpu_hw", "asic_hw"} @@ -7421,9 +7418,19 @@ def handle_asic_sdk_health_suppress_category_list(db, severity, category_list): if unsupportCategories: ctx.fail("Invalid category(ies): {}".format(unsupportCategories)) - cfgdb_clients = db.cfgdb_clients + for ns in namespace_list: + if namespace and namespace != ns: + continue - for ns, config_db in cfgdb_clients.items(): + config_db = db.cfgdb_clients[ns] + state_db = db.db_clients[ns] + + entry_name="SWITCH_CAPABILITY|switch" + if "true" != state_db.get(state_db.STATE_DB, entry_name, "ASIC_SDK_HEALTH_EVENT"): + ctx.fail("ASIC/SDK health event is not supported on the platform") + + if "true" != state_db.get(state_db.STATE_DB, entry_name, severityCapabilities[severity]): + ctx.fail("Suppressing ASIC/SDK health {} event is not supported on the platform".format(severity)) if suppressedCategoriesList: config_db.mod_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, {"categories": suppressedCategoriesList}) @@ -7433,23 +7440,32 @@ def handle_asic_sdk_health_suppress_category_list(db, severity, category_list): @suppress.command() @click.argument('category-list', required=True) +@click.option('--namespace', '-n', 'namespace', required=False, default=None, type=str, show_default=False, + help='Option needed for multi-asic only: provide namespace name', + callback=multi_asic_util.multi_asic_namespace_validation_callback) @clicommon.pass_db -def fatal(db, category_list): - handle_asic_sdk_health_suppress_category_list(db, 'fatal', category_list) +def fatal(db, category_list, namespace): + handle_asic_sdk_health_suppress_category_list(db, 'fatal', category_list, 
namespace) @suppress.command() @click.argument('category-list', required=True) +@click.option('--namespace', '-n', 'namespace', required=False, default=None, type=str, show_default=False, + help='Option needed for multi-asic only: provide namespace name', + callback=multi_asic_util.multi_asic_namespace_validation_callback) @clicommon.pass_db -def warning(db, category_list): - handle_asic_sdk_health_suppress_category_list(db, 'warning', category_list) +def warning(db, category_list, namespace): + handle_asic_sdk_health_suppress_category_list(db, 'warning', category_list, namespace) @suppress.command() @click.argument('category-list', required=True) +@click.option('--namespace', '-n', 'namespace', required=False, default=None, type=str, show_default=False, + help='Option needed for multi-asic only: provide namespace name', + callback=multi_asic_util.multi_asic_namespace_validation_callback) @clicommon.pass_db -def notice(db, category_list): - handle_asic_sdk_health_suppress_category_list(db, 'notice', category_list) +def notice(db, category_list, namespace): + handle_asic_sdk_health_suppress_category_list(db, 'notice', category_list, namespace) if __name__ == '__main__': diff --git a/show/main.py b/show/main.py index 2a6ee07ad7..9679438e35 100755 --- a/show/main.py +++ b/show/main.py @@ -71,6 +71,7 @@ PLATFORM_JSON = 'platform.json' HWSKU_JSON = 'hwsku.json' PORT_STR = "Ethernet" +DEFAULT_NAMESPACE = '' VLAN_SUB_INTERFACE_SEPARATOR = '.' 
@@ -2148,41 +2149,91 @@ def asic_sdk_health_event(): @asic_sdk_health_event.command() @clicommon.pass_db -def suppressed_category_list(db): - """""" - if "true" != db.db.get(db.db.STATE_DB, "SWITCH_CAPABILITY|switch", "ASIC_SDK_HEALTH_EVENT"): - ctx = click.get_current_context() - ctx.fail("ASIC/SDK health event is not supported on the platform") +@click.option('--namespace', '-n', 'namespace', default=None, show_default=True, + type=click.Choice(multi_asic_util.multi_asic_ns_choices()), help='Namespace name or all') +def suppressed_category_list(db, namespace): + """ Show the suppressed category list """ + if multi_asic.get_num_asics() > 1: + namespace_list = multi_asic.get_namespaces_from_linux() + masic = True + else: + namespace_list = [DEFAULT_NAMESPACE] + masic = False - suppressSeverities = db.cfgdb.get_table('SUPPRESS_ASIC_SDK_HEALTH_EVENT') header = ['Severity', 'Suppressed category-list'] body = [] - for severity in natsorted(suppressSeverities): - body.append([severity, ','.join(suppressSeverities[severity]['categories'])]) + supported = False - click.echo(tabulate(body, header)) + for ns in namespace_list: + if namespace and namespace != ns: + continue + state_db = db.db_clients[ns] + if "true" != state_db.get(db.db.STATE_DB, "SWITCH_CAPABILITY|switch", "ASIC_SDK_HEALTH_EVENT"): + continue -@asic_sdk_health_event.command() -@clicommon.pass_db -def received(db): - """""" - if "true" != db.db.get(db.db.STATE_DB, "SWITCH_CAPABILITY|switch", "ASIC_SDK_HEALTH_EVENT"): + supported = True + + if masic: + click.echo("{}:".format(ns)); + + config_db = db.cfgdb_clients[ns] + suppressSeverities = config_db.get_table('SUPPRESS_ASIC_SDK_HEALTH_EVENT') + + for severity in natsorted(suppressSeverities): + body.append([severity, ','.join(suppressSeverities[severity]['categories'])]) + + if supported: + click.echo(tabulate(body, header)) + else: ctx = click.get_current_context() ctx.fail("ASIC/SDK health event is not supported on the platform") - event_keys = 
db.db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE|*") - delimiter = db.db.get_db_separator(db.db.STATE_DB) - header = ['Date', 'ASICID', 'Severity', 'Category', 'Description'] +@asic_sdk_health_event.command() +@clicommon.pass_db +@click.option('--namespace', '-n', 'namespace', default=None, show_default=True, + type=click.Choice(multi_asic_util.multi_asic_ns_choices()), help='Namespace name or all') +def received(db, namespace): + """ Show the received ASIC/SDK health event """ + if multi_asic.get_num_asics() > 1: + namespace_list = multi_asic.get_namespaces_from_linux() + masic = True + else: + namespace_list = [DEFAULT_NAMESPACE] + masic = False + + header = ['Date', 'Severity', 'Category', 'Description'] body = [] - for key in natsorted(event_keys): - event = db.db.get_all(db.db.STATE_DB, key) - body.append([key.split('|')[1], event.get('asic_id'), event.get('severity'), event.get('category'), event.get('description')]) + supported = False - click.echo(tabulate(body, header)) + for ns in namespace_list: + if namespace and namespace != ns: + continue + + state_db = db.db_clients[ns] + if "true" != state_db.get(db.db.STATE_DB, "SWITCH_CAPABILITY|switch", "ASIC_SDK_HEALTH_EVENT"): + continue + + supported = True + + if masic: + click.echo("{}:".format(ns)); + + event_keys = state_db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE|*") + delimiter = state_db.get_db_separator(db.db.STATE_DB) + + for key in natsorted(event_keys): + event = state_db.get_all(state_db.STATE_DB, key) + body.append([key.split('|')[1], event.get('severity'), event.get('category'), event.get('description')]) + + if supported: + click.echo(tabulate(body, header)) + else: + ctx = click.get_current_context() + ctx.fail("ASIC/SDK health event is not supported on the platform") # Load plugins and register them diff --git a/tests/asic_sdk_health_event_test.py b/tests/asic_sdk_health_event_test.py index 9dba3315f4..3127b69823 100644 --- a/tests/asic_sdk_health_event_test.py +++ 
b/tests/asic_sdk_health_event_test.py @@ -96,9 +96,9 @@ def test_config_suppress_asic_sdk_health_event_unsupported_event(self): def test_show_asic_sdk_health_event_received(self): expected_output = \ - "Date ASICID Severity Category Description\n" - "------------------- -------- ---------- ---------- ---------------------\n" - "2023-11-22 09:18:12 0 fatal firmware ASIC SDK health event\n" + "Date Severity Category Description\n" + "------------------- ---------- ---------- ---------------------\n" + "2023-11-22 09:18:12 fatal firmware ASIC SDK health event\n" dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db') runner = CliRunner() @@ -110,10 +110,10 @@ def test_show_asic_sdk_health_event_received(self): def test_show_asic_sdk_health_event_suppressed_category_list(self): expected_output = \ - 'Severity Suppressed category-list\n' - '---------- --------------------------\n' - 'fatal software\n' - 'warning firmware,asic_hw\n' + "Severity Suppressed category-list\n" + "---------- --------------------------\n" + "fatal software\n" + "warning firmware,asic_hw\n" dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db') dbconnector.dedicated_dbs['CONFIG_DB'] = os.path.join(mock_db_path, 'config_db') From 0f969b791cb37fe20d6361b123681b99ce9e4048 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Tue, 12 Dec 2023 10:33:05 +0000 Subject: [PATCH 4/9] Adjust for multi ASIC system Signed-off-by: Stephen Sun --- clear/main.py | 21 +++++- config/main.py | 12 ++-- doc/Command-Reference.md | 151 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 175 insertions(+), 9 deletions(-) diff --git a/clear/main.py b/clear/main.py index e5e9fd8d9c..a66c92d90b 100755 --- a/clear/main.py +++ b/clear/main.py @@ -5,6 +5,7 @@ import click import utilities_common.cli as clicommon import utilities_common.multi_asic as multi_asic_util +from sonic_py_common import multi_asic from sonic_py_common.general import getstatusoutput_noshell_pipe from 
flow_counter_util.route import exit_if_route_flow_counter_not_support from utilities_common import util_base @@ -12,6 +13,7 @@ from config.plugins.pbh import serialize_pbh_counters from . import plugins +DEFAULT_NAMESPACE = '' # This is from the aliases example: # https://github.com/pallets/click/blob/57c6f09611fc47ca80db0bd010f05998b3c0aa95/examples/aliases/aliases.py @@ -552,11 +554,24 @@ def route(prefix, vrf, namespace): # ("sonic-clear asic-sdk-health-event") @cli.command() +@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False, + help='Option needed for multi-asic only: provide namespace name', + type=click.Choice(multi_asic_util.multi_asic_ns_choices())) @clicommon.pass_db -def asic_sdk_health_event(db): - keys = db.db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE*") +def asic_sdk_health_event(db, namespace): + if multi_asic.get_num_asics() > 1: + namespace_list = multi_asic.get_namespaces_from_linux() + else: + namespace_list = [DEFAULT_NAMESPACE] + + for ns in namespace_list: + if namespace and namespace != ns: + continue + + state_db = db.db_clients[ns] + keys = state_db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE*") for key in keys: - db.db.delete(db.db.STATE_DB, key); + state_db.delete(state_db.STATE_DB, key); if __name__ == '__main__': diff --git a/config/main.py b/config/main.py index 1fca620532..c5d50c79a2 100644 --- a/config/main.py +++ b/config/main.py @@ -7440,9 +7440,9 @@ def handle_asic_sdk_health_suppress_category_list(db, severity, category_list, n @suppress.command() @click.argument('category-list', required=True) -@click.option('--namespace', '-n', 'namespace', required=False, default=None, type=str, show_default=False, +@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False, help='Option needed for multi-asic only: provide namespace name', - callback=multi_asic_util.multi_asic_namespace_validation_callback) + 
type=click.Choice(multi_asic_util.multi_asic_ns_choices())) @clicommon.pass_db def fatal(db, category_list, namespace): handle_asic_sdk_health_suppress_category_list(db, 'fatal', category_list, namespace) @@ -7450,9 +7450,9 @@ def fatal(db, category_list, namespace): @suppress.command() @click.argument('category-list', required=True) -@click.option('--namespace', '-n', 'namespace', required=False, default=None, type=str, show_default=False, +@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False, help='Option needed for multi-asic only: provide namespace name', - callback=multi_asic_util.multi_asic_namespace_validation_callback) + type=click.Choice(multi_asic_util.multi_asic_ns_choices())) @clicommon.pass_db def warning(db, category_list, namespace): handle_asic_sdk_health_suppress_category_list(db, 'warning', category_list, namespace) @@ -7460,9 +7460,9 @@ def warning(db, category_list, namespace): @suppress.command() @click.argument('category-list', required=True) -@click.option('--namespace', '-n', 'namespace', required=False, default=None, type=str, show_default=False, +@click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False, help='Option needed for multi-asic only: provide namespace name', - callback=multi_asic_util.multi_asic_namespace_validation_callback) + type=click.Choice(multi_asic_util.multi_asic_ns_choices())) @clicommon.pass_db def notice(db, category_list, namespace): handle_asic_sdk_health_suppress_category_list(db, 'notice', category_list, namespace) diff --git a/doc/Command-Reference.md b/doc/Command-Reference.md index 30f317be80..070bf001f9 100644 --- a/doc/Command-Reference.md +++ b/doc/Command-Reference.md @@ -29,6 +29,10 @@ * [ARP & NDP](#arp--ndp) * [ARP show commands](#arp-show-commands) * [NDP show commands](#ndp-show-commands) +* [ASIC SDK health event](#asic-sdk-health-event) + * [ASIC SDK health event config 
commands](#asic-sdk-health-event-config-commands) + * [ASIC SDK health event show commands](#asic-sdk-health-event-show-commands) + * [ASIC SDK health event clear commands](#asic-sdk-health-event-clear-commands) * [BFD](#bfd) * [BFD show commands](#bfd-show-commands) * [BGP](#bgp) @@ -1928,6 +1932,153 @@ This command is used to display: ACL rules, tables and their priority, ACL packe If the `PACKETS COUNT` and `BYTES COUNT` fields have some numeric value it means that it is a SONiC ACL's and those counters are created in SONiC `COUNTERS_DB`. +## ASIC SDK health event + +### ASIC SDK health event config commands + +**config asic-sdk-health-event suppress <severity> <category-list>** + +This command is for a customer to configure the categories that he/she wants to suppress for a certain severity. + +- Usage: + ``` + config asic-sdk-health-event suppress <severity> <category-list>|none|all + ``` + + - Parameters: + - severity: Specify the severity whose ASIC/SDK health events are to be suppressed. It can be one of `fatal`, `warning`, and `notice`. + - category-list: Specify the categories from which the ASIC/SDK health events are to be suppressed. It is a list whose element is one of `software`, `firmware`, `cpu_hw`, `asic_hw` separated by a comma. + If the category-list is `none`, no category is suppressed and all the categories will be notified for `severity`. + If the category-list is `all`, all the categories are suppressed and no category will be notified for `severity`. + +- Examples: + ``` + admin@sonic:~$ sudo config asic-sdk-health-event suppress fatal cpu_hw,software + ``` + + This command will suppress ASIC/SDK health events whose severity is fatal and category is cpu_hw or software. + +### ASIC SDK health event show commands + +**show asic-sdk-health-event received** + +This command displays the received ASIC/SDK health events.
+ +- Usage: + ``` + show asic-sdk-health-event received [-n ] + ``` + +- Details: + - show asic-sdk-health-event received: Display the ASIC/SDK health events received on all ASICs + - show asic-sdk-health-event received -n asic0: Display all the ASIC/SDK health events received on asic0 + + +- Example: + ``` + admin@sonic:~$ show asic-sdk-health-event received + Time Severity Category Description + ------------------- ----------- --------- ----------------- + 2023-10-20 05:07:34 fatal firmware Command timeout + 2023-10-20 03:06:25 fatal software SDK daemon keep alive failed + 2023-10-20 05:07:34 fatal asic_hw Uncorrectable ECC error + 2023-10-20 01:58:43 notice asic_hw Correctable ECC error + ``` + +- Example on a multi ASIC system: + ``` + admin@sonic:~$ show asic-sdk-health-event received + asic0: + Time Severity Category Description + ------------------- ----------- --------- ----------------- + 2023-10-20 05:07:34 fatal firmware Command timeout + 2023-10-20 03:06:25 fatal software SDK daemon keep alive failed + asic1: + Time Severity Category Description + ------------------- ----------- --------- ----------------- + 2023-10-20 05:07:34 fatal asic_hw Uncorrectable ECC error + 2023-10-20 01:58:43 notice asic_hw Correctable ECC error + ``` + +Optionally, you can specify the asic name in order to display the ASIC/SDK health events received on that particular ASIC on a multi ASIC system + +- Example: + ``` + admin@sonic:~$ show asic-sdk-health-event received -n asic1 + asic1: + Time Severity Category Description + ------------------- ----------- --------- ----------------- + 2023-10-20 05:07:34 fatal firmware Command timeout + ``` + +**show asic-sdk-health-event suppressed-category-list** + +This command displays the suppressed category list of ASIC/SDK health events. 
+ +- Usage: + ``` + show asic-sdk-health-event suppressed-category-list [-n ] + ``` + +- Details: + - show asic-sdk-health-event suppressed-category-list: Display the ASIC/SDK health event suppress category list on all ASICs + - show asic-sdk-health-event suppressed-category-list -n asic0: Display all the ASIC/SDK health event suppress category list on asic0 + + +- Example: + ``` + admin@sonic:~$ show asic-sdk-health-event suppressed-category-list + Severity Suppressed category-list + ---------- -------------------------- + notice asic_hw,cpu_hw + ``` + +- Example on a multi ASIC system: + ``` + admin@sonic:~$ show asic-sdk-health-event suppressed-category-list + asic0: + Severity Suppressed category-list + ---------- -------------------------- + notice asic_hw + asic1: + Severity Suppressed category-list + ---------- -------------------------- + notice cpu_hw + ``` + +Optionally, you can specify the asic name in order to display the ASIC/SDK health event suppress category list on that particular ASIC on a multi ASIC system + +- Example: + ``` + admin@sonic:~$ show asic-sdk-health-event suppressed-category-list -n asic1 + asic1: + Severity Suppressed category-list + ---------- -------------------------- + notice cpu_hw + ``` + +### ASIC SDK health event clear commands + +**sonic-clear asic-sdk-health-event** + +This command clears all the received ASIC/SDK health events. 
+ +- Usage: + ``` + sonic-clear asic-sdk-health-event [-n ] + ``` + +- Details: + - sonic-clear asic-sdk-health-event: Clear the ASIC/SDK health events received on all ASICs + - sonic-clear asic-sdk-health-event -n asic0: Clear all the ASIC/SDK health events received on asic0 + + +- Example: + ``` + admin@sonic:~$ sonic-clear asic-sdk-health-event + ``` + +Go Back To [Beginning of the document](#) or [Beginning of this section](#asic-sdk-health-event) ## ARP & NDP From 9f3e095b08ad9860103b15308549159c9db8756e Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Tue, 12 Dec 2023 11:43:30 +0000 Subject: [PATCH 5/9] Fix internal review comments Signed-off-by: Stephen Sun --- clear/main.py | 13 ++++++------- config/main.py | 8 +++----- show/main.py | 6 ++---- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/clear/main.py b/clear/main.py index a66c92d90b..5ffcd2dba4 100755 --- a/clear/main.py +++ b/clear/main.py @@ -13,8 +13,6 @@ from config.plugins.pbh import serialize_pbh_counters from . 
import plugins -DEFAULT_NAMESPACE = '' - # This is from the aliases example: # https://github.com/pallets/click/blob/57c6f09611fc47ca80db0bd010f05998b3c0aa95/examples/aliases/aliases.py class Config(object): @@ -559,19 +557,20 @@ def route(prefix, vrf, namespace): type=click.Choice(multi_asic_util.multi_asic_ns_choices())) @clicommon.pass_db def asic_sdk_health_event(db, namespace): + """Clear received ASIC/SDK health events""" if multi_asic.get_num_asics() > 1: namespace_list = multi_asic.get_namespaces_from_linux() else: - namespace_list = [DEFAULT_NAMESPACE] + namespace_list = [multi_asic.DEFAULT_NAMESPACE] for ns in namespace_list: if namespace and namespace != ns: continue - state_db = db.db_clients[ns] - keys = state_db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE*") - for key in keys: - state_db.delete(state_db.STATE_DB, key); + state_db = db.db_clients[ns] + keys = state_db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE*") + for key in keys: + state_db.delete(state_db.STATE_DB, key); if __name__ == '__main__': diff --git a/config/main.py b/config/main.py index c5d50c79a2..e532f6f5c9 100644 --- a/config/main.py +++ b/config/main.py @@ -7378,15 +7378,13 @@ def date(date, time): # 'asic-sdk-health-event' group ('config asic-sdk-health-event ...') # @config.group() -@click.pass_context -def asic_sdk_health_event(ctx): +def asic_sdk_health_event(): """Configuring asic-sdk-health-event""" pass @asic_sdk_health_event.group() -@clicommon.pass_db -def suppress(db): +def suppress(): """Suppress ASIC/SDK health event""" pass @@ -7425,7 +7423,7 @@ def handle_asic_sdk_health_suppress_category_list(db, severity, category_list, n config_db = db.cfgdb_clients[ns] state_db = db.db_clients[ns] - entry_name="SWITCH_CAPABILITY|switch" + entry_name = "SWITCH_CAPABILITY|switch" if "true" != state_db.get(state_db.STATE_DB, entry_name, "ASIC_SDK_HEALTH_EVENT"): ctx.fail("ASIC/SDK health event is not supported on the platform") diff --git a/show/main.py b/show/main.py index 
9679438e35..53d0d19782 100755 --- a/show/main.py +++ b/show/main.py @@ -71,7 +71,6 @@ PLATFORM_JSON = 'platform.json' HWSKU_JSON = 'hwsku.json' PORT_STR = "Ethernet" -DEFAULT_NAMESPACE = '' VLAN_SUB_INTERFACE_SEPARATOR = '.' @@ -2157,7 +2156,7 @@ def suppressed_category_list(db, namespace): namespace_list = multi_asic.get_namespaces_from_linux() masic = True else: - namespace_list = [DEFAULT_NAMESPACE] + namespace_list = [multi_asic.DEFAULT_NAMESPACE] masic = False header = ['Severity', 'Suppressed category-list'] @@ -2201,7 +2200,7 @@ def received(db, namespace): namespace_list = multi_asic.get_namespaces_from_linux() masic = True else: - namespace_list = [DEFAULT_NAMESPACE] + namespace_list = [multi_asic.DEFAULT_NAMESPACE] masic = False header = ['Date', 'Severity', 'Category', 'Description'] @@ -2223,7 +2222,6 @@ def received(db, namespace): click.echo("{}:".format(ns)); event_keys = state_db.keys(db.db.STATE_DB, "ASIC_SDK_HEALTH_EVENT_TABLE|*") - delimiter = state_db.get_db_separator(db.db.STATE_DB) for key in natsorted(event_keys): event = state_db.get_all(state_db.STATE_DB, key) From 4013b5b6793ebdf61cb1fd13f894c9516257bdba Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Wed, 10 Jan 2024 05:33:52 +0000 Subject: [PATCH 6/9] Address community review comments Eliminate oldest events if the number of events exceed the threshold Signed-off-by: Stephen Sun --- config/main.py | 77 +++++++++++++------ doc/Command-Reference.md | 51 ++++++------ show/main.py | 10 ++- .../config_db.json | 6 +- tests/asic_sdk_health_event_test.py | 72 ++++++++++++++--- 5 files changed, 153 insertions(+), 63 deletions(-) diff --git a/config/main.py b/config/main.py index e532f6f5c9..50a94bfd32 100644 --- a/config/main.py +++ b/config/main.py @@ -7389,7 +7389,7 @@ def suppress(): pass -def handle_asic_sdk_health_suppress_category_list(db, severity, category_list, namespace): +def handle_asic_sdk_health_suppress(db, severity, category_list, max_events, namespace): ctx = 
click.get_current_context() if multi_asic.get_num_asics() > 1: @@ -7403,18 +7403,27 @@ def handle_asic_sdk_health_suppress_category_list(db, severity, category_list, n "notice": "REG_NOTICE_ASIC_SDK_HEALTH_CATEGORY" } - categories = {"software", "firmware", "cpu_hw", "asic_hw"} + if category_list: + categories = {"software", "firmware", "cpu_hw", "asic_hw"} - if category_list == 'none': - suppressedCategoriesList = [] - elif category_list == 'all': - suppressedCategoriesList = list(categories) - else: - suppressedCategoriesList = category_list.split(',') + if category_list == 'none': + suppressedCategoriesList = [] + elif category_list == 'all': + suppressedCategoriesList = list(categories) + else: + suppressedCategoriesList = category_list.split(',') + + unsupportCategories = set(suppressedCategoriesList) - categories + if unsupportCategories: + ctx.fail("Invalid category(ies): {}".format(unsupportCategories)) - unsupportCategories = set(suppressedCategoriesList) - categories - if unsupportCategories: - ctx.fail("Invalid category(ies): {}".format(unsupportCategories)) + if max_events: + try: + max_events_number = int(max_events) + if max_events_number < 0: + ctx.fail("Invalid max-events: {}".format(max_events)) + except ValueError as e: + ctx.fail("Invalid max-events: {}".format(max_events)) for ns in namespace_list: if namespace and namespace != ns: @@ -7430,40 +7439,60 @@ def handle_asic_sdk_health_suppress_category_list(db, severity, category_list, n if "true" != state_db.get(state_db.STATE_DB, entry_name, severityCapabilities[severity]): ctx.fail("Suppressing ASIC/SDK health {} event is not supported on the platform".format(severity)) - if suppressedCategoriesList: - config_db.mod_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, {"categories": suppressedCategoriesList}) - else: - config_db.mod_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, None) + entry = config_db.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity) + need_remove = False + + if 
category_list: + if suppressedCategoriesList: + entry["categories"] = suppressedCategoriesList + elif entry.get("categories"): + entry.pop("categories") + need_remove = True + + if max_events is not None: + if max_events > 0: + entry["max_events"] = max_events + elif entry.get("max_events"): + entry.pop("max_events") + need_remove = True + + if entry: + config_db.set_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, entry) + elif need_remove: + config_db.set_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, None) @suppress.command() -@click.argument('category-list', required=True) +@click.option('--category-list', metavar='', type=str, help="Categories to be suppressed") +@click.option('--max-events', metavar='', type=click.IntRange(0), help="Maximum number of received events") @click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False, help='Option needed for multi-asic only: provide namespace name', type=click.Choice(multi_asic_util.multi_asic_ns_choices())) @clicommon.pass_db -def fatal(db, category_list, namespace): - handle_asic_sdk_health_suppress_category_list(db, 'fatal', category_list, namespace) +def fatal(db, category_list, max_events, namespace): + handle_asic_sdk_health_suppress(db, 'fatal', category_list, max_events, namespace) @suppress.command() -@click.argument('category-list', required=True) +@click.option('--category-list', metavar='', type=str, help="Categories to be suppressed") +@click.option('--max-events', metavar='', type=click.IntRange(0), help="Maximum number of received events") @click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False, help='Option needed for multi-asic only: provide namespace name', type=click.Choice(multi_asic_util.multi_asic_ns_choices())) @clicommon.pass_db -def warning(db, category_list, namespace): - handle_asic_sdk_health_suppress_category_list(db, 'warning', category_list, namespace) +def warning(db, category_list, max_events, 
namespace): + handle_asic_sdk_health_suppress(db, 'warning', category_list, max_events, namespace) @suppress.command() -@click.argument('category-list', required=True) +@click.option('--category-list', metavar='', type=str, help="Categories to be suppressed") +@click.option('--max-events', metavar='', type=click.IntRange(0), help="Maximum number of received events") @click.option('--namespace', '-n', 'namespace', required=False, default=None, show_default=False, help='Option needed for multi-asic only: provide namespace name', type=click.Choice(multi_asic_util.multi_asic_ns_choices())) @clicommon.pass_db -def notice(db, category_list, namespace): - handle_asic_sdk_health_suppress_category_list(db, 'notice', category_list, namespace) +def notice(db, category_list, max_events, namespace): + handle_asic_sdk_health_suppress(db, 'notice', category_list, max_events, namespace) if __name__ == '__main__': diff --git a/doc/Command-Reference.md b/doc/Command-Reference.md index 070bf001f9..d4ddeb9940 100644 --- a/doc/Command-Reference.md +++ b/doc/Command-Reference.md @@ -1942,21 +1942,23 @@ This command is for a customer to configure the categories that he/she wants to - Usage: ``` - config config asic-sdk-health-event suppress || + config asic-sdk-health-event suppress [--category-list ||] [--max-events ] ``` - Parameters: - severity: Specify the severity whose ASIC/SDK health events to be suppressed. It can be one of `fatal`, `warning`, and `notice`. - category-list: Specify the categories from which the ASIC/SDK health events to be suppressed. It is a list whose element is one of `software`, `firmware`, `cpu_hw`, `asic_hw` separated by a comma. - If the category-list is `none`, none category is suppressed and all the categories will be notified for `severity`. + If the category-list is `none`, no category is suppressed and all the categories will be notified for `severity`. In this case, it will not be stored in the CONFIG_DB. 
If the category-list is `all`, all the categories are suppressed and none category will be notified for `severity`. + - max-events: Specify the maximum number of events of the severity to be stored in the STATE_DB. + There is no limitation if the max-events is 0. In this case, it will not be stored in the CONFIG_DB. - Examples: ``` - admin@sonic:~$ sudo config asic-sdk-health-event suppress fatal cpu_hw,software + admin@sonic:~$ sudo config asic-sdk-health-event suppress fatal --category-list cpu_hw,software --max-events 10240 ``` - This command will suppress ASIC/SDK health events whose severity is fatal and cagetory is cpu_hw or software. + This command will suppress ASIC/SDK health events whose severity is fatal and category is cpu_hw or software. Maximum number of such events in the STATE_DB is 10240. ### ASIC SDK health event show commands @@ -2011,9 +2013,9 @@ Optionally, you can specify the asic name in order to display the ASIC/SDK healt 2023-10-20 05:07:34 fatal firmware Command timeout ``` -**show asic-sdk-health-event suppressed-category-list** +**show asic-sdk-health-event suppress-configuration** -This command displays the suppressed category list of ASIC/SDK health events. +This command displays the suppressed category list and maximum number of events of ASIC/SDK health events. - Usage: ``` @@ -2021,40 +2023,43 @@ This command displays the suppressed category list of ASIC/SDK health events. 
``` - Details: - - show asic-sdk-health-event suppressed-category-list: Display the ASIC/SDK health event suppress category list on all ASICs - - show asic-sdk-health-event suppressed-category-list -n asic0: Display all the ASIC/SDK health event suppress category list on asic0 + - show asic-sdk-health-event suppress-configuration: Display the ASIC/SDK health event suppress category list and maximum number of events on all ASICs + - show asic-sdk-health-event suppress-configuration -n asic0: Display all the ASIC/SDK health event suppress category list and maximum number of events on asic0 - Example: ``` - admin@sonic:~$ show asic-sdk-health-event suppressed-category-list - Severity Suppressed category-list - ---------- -------------------------- - notice asic_hw,cpu_hw + admin@sonic:~$ show asic-sdk-health-event suppress-configuration + Severity Suppressed category-list Max events + ---------- -------------------------- ------------ + fatal software unlimited + notice none 1024 + warning firmware,asic_hw 10240 ``` - Example on a multi ASIC system: ``` - admin@sonic:~$ show asic-sdk-health-event suppressed-category-list + admin@sonic:~$ show asic-sdk-health-event suppress-configuration asic0: - Severity Suppressed category-list - ---------- -------------------------- - notice asic_hw + Severity Suppressed category-list Max events + ---------- -------------------------- ------------ + notice none 1024 + warning firmware,asic_hw 10240 asic1: - Severity Suppressed category-list - ---------- -------------------------- - notice cpu_hw + Severity Suppressed category-list Max events + ---------- -------------------------- ------------ + fatal software unlimited ``` Optionally, you can specify the asic name in order to display the ASIC/SDK health event suppress category list on that particular ASIC on a multi ASIC system - Example: ``` - admin@sonic:~$ show asic-sdk-health-event suppressed-category-list -n asic1 + admin@sonic:~$ show asic-sdk-health-event 
suppress-configuration -n asic1 asic1: - Severity Suppressed category-list - ---------- -------------------------- - notice cpu_hw + Severity Suppressed category-list Max events + ---------- -------------------------- ------------ + fatal software unlimited ``` ### ASIC SDK health event clear commands diff --git a/show/main.py b/show/main.py index 53d0d19782..9740711f33 100755 --- a/show/main.py +++ b/show/main.py @@ -2150,8 +2150,8 @@ def asic_sdk_health_event(): @clicommon.pass_db @click.option('--namespace', '-n', 'namespace', default=None, show_default=True, type=click.Choice(multi_asic_util.multi_asic_ns_choices()), help='Namespace name or all') -def suppressed_category_list(db, namespace): - """ Show the suppressed category list """ +def suppress_configuration(db, namespace): + """ Show the suppress configuration """ if multi_asic.get_num_asics() > 1: namespace_list = multi_asic.get_namespaces_from_linux() masic = True @@ -2159,7 +2159,7 @@ def suppressed_category_list(db, namespace): namespace_list = [multi_asic.DEFAULT_NAMESPACE] masic = False - header = ['Severity', 'Suppressed category-list'] + header = ['Severity', 'Suppressed category-list', "Max events"] body = [] supported = False @@ -2181,7 +2181,9 @@ def suppressed_category_list(db, namespace): suppressSeverities = config_db.get_table('SUPPRESS_ASIC_SDK_HEALTH_EVENT') for severity in natsorted(suppressSeverities): - body.append([severity, ','.join(suppressSeverities[severity]['categories'])]) + body.append([severity, + ','.join(suppressSeverities[severity].get('categories', ['none'])), + suppressSeverities[severity].get('max_events', 'unlimited')]) if supported: click.echo(tabulate(body, header)) diff --git a/tests/asic_sdk_health_event_input/config_db.json b/tests/asic_sdk_health_event_input/config_db.json index fb929701be..251f443c3d 100644 --- a/tests/asic_sdk_health_event_input/config_db.json +++ b/tests/asic_sdk_health_event_input/config_db.json @@ -3,6 +3,10 @@ "categories@": "software" }, 
"SUPPRESS_ASIC_SDK_HEALTH_EVENT|warning": { - "categories@": "firmware,asic_hw" + "categories@": "firmware,asic_hw", + "max_events": "10240" + }, + "SUPPRESS_ASIC_SDK_HEALTH_EVENT|notice": { + "max_events": "1024" } } diff --git a/tests/asic_sdk_health_event_test.py b/tests/asic_sdk_health_event_test.py index 3127b69823..3d708c8460 100644 --- a/tests/asic_sdk_health_event_test.py +++ b/tests/asic_sdk_health_event_test.py @@ -42,31 +42,80 @@ def test_config_suppress_asic_sdk_health_event(self, severity, categories): result = runner.invoke( config.config.commands["asic-sdk-health-event"].commands["suppress"], - [severity, "all"], obj=db) + [severity, "--category-list", "all"], obj=db) assert result.exit_code == 0 output_categories = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['categories'] assert {'asic_hw', 'firmware', 'cpu_hw', 'software'} == set(output_categories) result = runner.invoke( config.config.commands["asic-sdk-health-event"].commands["suppress"], - [severity, categories], obj=db) + [severity, "--category-list", categories], obj=db) assert result.exit_code == 0 output_categories = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['categories'] assert set(categories.split(',')) == set(output_categories) result = runner.invoke( config.config.commands["asic-sdk-health-event"].commands["suppress"], - [severity, "none"], obj=db) + [severity, "--category-list", "none"], obj=db) assert result.exit_code == 0 assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity) result = runner.invoke( config.config.commands["asic-sdk-health-event"].commands["suppress"], - [severity, "unknown"], obj=db) + [severity, "--category-list", "unknown"], obj=db) assert result.exit_code != 0 assert "Invalid category(ies): {'unknown'}" in result.output assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity) + def test_config_suppress_asic_sdk_health_event_mix_category_list_max_events(self): + severity = "fatal" + 
dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db') + + runner = CliRunner() + db = Db() + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "--category-list", "cpu_hw", "--max-events", "10"], obj=db) + assert result.exit_code == 0 + output_categories = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['categories'] + output_max_events = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['max_events'] + assert output_categories == ["cpu_hw"] + assert output_max_events == "10" + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "--category-list", "none", "--max-events", "10"], obj=db) + assert result.exit_code == 0 + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('categories') + output_max_events = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['max_events'] + assert output_max_events == "10" + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "--category-list", "cpu_hw", "--max-events", "0"], obj=db) + assert result.exit_code == 0 + output_categories = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['categories'] + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('max_events') + assert output_categories == ["cpu_hw"] + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "--max-events", "10"], obj=db) + assert result.exit_code == 0 + output_categories = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['categories'] + output_max_events = db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity)['max_events'] + assert output_categories == ["cpu_hw"] + assert output_max_events == "10" + + result = runner.invoke( + 
config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "--category-list", "none", "--max-events", "0"], obj=db) + assert result.exit_code == 0 + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('categories') + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('max_events') + + @pytest.mark.parametrize("severity", ["fatal", "warning", "notice"]) def test_config_suppress_asic_sdk_health_event_unsupported_severity(self, severity): dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db_no_' + severity) @@ -76,7 +125,7 @@ def test_config_suppress_asic_sdk_health_event_unsupported_severity(self, severi result = runner.invoke( config.config.commands["asic-sdk-health-event"].commands["suppress"], - [severity, "all"], obj=db) + [severity, "--category-list", "all"], obj=db) assert result.exit_code != 0 assert "Suppressing ASIC/SDK health {} event is not supported on the platform".format(severity) in result.output assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity) @@ -89,7 +138,7 @@ def test_config_suppress_asic_sdk_health_event_unsupported_event(self): result = runner.invoke( config.config.commands["asic-sdk-health-event"].commands["suppress"], - ["fatal", "all"], obj=db) + ["fatal", "--category-list", "all"], obj=db) assert result.exit_code != 0 assert "ASIC/SDK health event is not supported on the platform" in result.output assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", "fatal") @@ -110,17 +159,18 @@ def test_show_asic_sdk_health_event_received(self): def test_show_asic_sdk_health_event_suppressed_category_list(self): expected_output = \ - "Severity Suppressed category-list\n" - "---------- --------------------------\n" - "fatal software\n" - "warning firmware,asic_hw\n" + "Severity Suppressed category-list Max events\n" + "---------- -------------------------- ------------\n" + "fatal software unlimited\n" + "notice none 
1024\n" + "warning firmware,asic_hw 10240\n" dbconnector.dedicated_dbs['STATE_DB'] = os.path.join(mock_db_path, 'state_db') dbconnector.dedicated_dbs['CONFIG_DB'] = os.path.join(mock_db_path, 'config_db') runner = CliRunner() db = Db() - result = runner.invoke(show.cli.commands["asic-sdk-health-event"].commands["suppressed-category-list"], [], obj=db) + result = runner.invoke(show.cli.commands["asic-sdk-health-event"].commands["suppress-configuration"], [], obj=db) assert result.exit_code == 0 assert expected_output in result.output From 8a235093dace0b2ac9ed677e0eecd0f880ad1212 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Tue, 16 Jan 2024 04:07:58 +0000 Subject: [PATCH 7/9] Fix review comments Signed-off-by: Stephen Sun --- config/main.py | 14 +++----------- tests/asic_sdk_health_event_test.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/config/main.py b/config/main.py index 50a94bfd32..2f03d26e1f 100644 --- a/config/main.py +++ b/config/main.py @@ -7413,17 +7413,9 @@ def handle_asic_sdk_health_suppress(db, severity, category_list, max_events, nam else: suppressedCategoriesList = category_list.split(',') - unsupportCategories = set(suppressedCategoriesList) - categories - if unsupportCategories: - ctx.fail("Invalid category(ies): {}".format(unsupportCategories)) - - if max_events: - try: - max_events_number = int(max_events) - if max_events_number < 0: - ctx.fail("Invalid max-events: {}".format(max_events)) - except ValueError as e: - ctx.fail("Invalid max-events: {}".format(max_events)) + unsupportCategories = set(suppressedCategoriesList) - categories + if unsupportCategories: + ctx.fail("Invalid category(ies): {}".format(unsupportCategories)) for ns in namespace_list: if namespace and namespace != ns: diff --git a/tests/asic_sdk_health_event_test.py b/tests/asic_sdk_health_event_test.py index 3d708c8460..544213fb9e 100644 --- a/tests/asic_sdk_health_event_test.py +++ b/tests/asic_sdk_health_event_test.py @@ -115,6 
+115,22 @@ def test_config_suppress_asic_sdk_health_event_mix_category_list_max_events(self assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('categories') assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('max_events') + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "--max-events", "-50"], obj=db) + assert result.exit_code != 0 + assert "-50 is smaller than the minimum valid value 0" in result.output + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('categories') + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('max_events') + + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], + [severity, "--max-events", "NaN"], obj=db) + assert result.exit_code != 0 + assert "NaN is not a valid integer" in result.output + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('categories') + assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('max_events') + @pytest.mark.parametrize("severity", ["fatal", "warning", "notice"]) def test_config_suppress_asic_sdk_health_event_unsupported_severity(self, severity): From d9b242520e3e8f04272d3160ca671a901bfa3e00 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Tue, 16 Jan 2024 04:19:47 +0000 Subject: [PATCH 8/9] Fix review comments in show Signed-off-by: Stephen Sun --- show/main.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/show/main.py b/show/main.py index 9740711f33..3510bdd21f 100755 --- a/show/main.py +++ b/show/main.py @@ -2185,9 +2185,9 @@ def suppress_configuration(db, namespace): ','.join(suppressSeverities[severity].get('categories', ['none'])), suppressSeverities[severity].get('max_events', 'unlimited')]) - if supported: click.echo(tabulate(body, header)) - else: + + if not supported: ctx = click.get_current_context() 
ctx.fail("ASIC/SDK health event is not supported on the platform") @@ -2229,9 +2229,9 @@ def received(db, namespace): event = state_db.get_all(state_db.STATE_DB, key) body.append([key.split('|')[1], event.get('severity'), event.get('category'), event.get('description')]) - if supported: click.echo(tabulate(body, header)) - else: + + if not supported: ctx = click.get_current_context() ctx.fail("ASIC/SDK health event is not supported on the platform") From e6e31f55f49a803956ee74acd944611375c1f210 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Mon, 29 Jan 2024 12:54:07 +0000 Subject: [PATCH 9/9] Fix issue: at least one argument should be provided Signed-off-by: Stephen Sun --- config/main.py | 6 ++++++ tests/asic_sdk_health_event_test.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/config/main.py b/config/main.py index 2f03d26e1f..8d511eaadc 100644 --- a/config/main.py +++ b/config/main.py @@ -7433,8 +7433,10 @@ def handle_asic_sdk_health_suppress(db, severity, category_list, max_events, nam entry = config_db.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity) need_remove = False + noarg = True if category_list: + noarg = False if suppressedCategoriesList: entry["categories"] = suppressedCategoriesList elif entry.get("categories"): @@ -7442,12 +7444,16 @@ def handle_asic_sdk_health_suppress(db, severity, category_list, max_events, nam need_remove = True if max_events is not None: + noarg = False if max_events > 0: entry["max_events"] = max_events elif entry.get("max_events"): entry.pop("max_events") need_remove = True + if noarg: + ctx.fail("At least one argument should be provided!") + if entry: config_db.set_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity, entry) elif need_remove: diff --git a/tests/asic_sdk_health_event_test.py b/tests/asic_sdk_health_event_test.py index 544213fb9e..d709a693ca 100644 --- a/tests/asic_sdk_health_event_test.py +++ b/tests/asic_sdk_health_event_test.py @@ -131,6 +131,10 @@ def 
test_config_suppress_asic_sdk_health_event_mix_category_list_max_events(self assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('categories') assert not db.cfgdb.get_entry("SUPPRESS_ASIC_SDK_HEALTH_EVENT", severity).get('max_events') + result = runner.invoke( + config.config.commands["asic-sdk-health-event"].commands["suppress"], severity, obj=db) + assert result.exit_code != 0 + assert "At least one argument should be provided" in result.output @pytest.mark.parametrize("severity", ["fatal", "warning", "notice"]) def test_config_suppress_asic_sdk_health_event_unsupported_severity(self, severity):