Skip to content

Commit 054ed34

Browse files
authored
Support ASIC/SDK health event (#3020)
* ASIC/SDK health event: Support ASIC/SDK health events. Fetch the capabilities and expose them to STATE_DB. Register the event handler and the categories for each severity when supported. Handle suppressed ASIC/SDK health event categories. Handle ASIC/SDK health events reported by SAI redis in the callback context.
1 parent 5ef7370 commit 054ed34

12 files changed

Lines changed: 839 additions & 18 deletions

File tree

orchagent/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ CFLAGS_SAI = -I /usr/include/sai
1717
swssdir = $(datadir)/swss
1818

1919
dist_swss_DATA = \
20+
eliminate_events.lua \
2021
rif_rates.lua \
2122
pfc_detect_innovium.lua \
2223
pfc_detect_mellanox.lua \

orchagent/eliminate_events.lua

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
-- Trim stored ASIC/SDK health events down to the per-severity maximum.
--
-- KEYS - None
-- ARGV - None
--
-- For every SUPPRESS_ASIC_SDK_HEALTH_EVENT* key in the config DB that carries
-- a usable 'max_events' field, the events in the state DB whose 'severity'
-- field matches are counted; when there are more than 'max_events' of them,
-- the ones whose key names sort first are deleted until the count equals the
-- limit.
--
-- Returns a list of strings, one per severity with a usable limit, describing
-- what was done. The list is empty when no limit is configured or when the
-- state DB holds no event at all.

local state_db = "6"
local config_db = "4"

local suppress_table = 'SUPPRESS_ASIC_SDK_HEALTH_EVENT'
local event_table = 'ASIC_SDK_HEALTH_EVENT_TABLE'

-- The severity name follows the table name and a one-character separator in
-- the config key, i.e. it starts at character 32.
local severity_offset = #suppress_table + 2

local result = {}

redis.call('SELECT', config_db)
local severity_keys = redis.call('KEYS', suppress_table .. '*')
if #severity_keys == 0 then
    return result
end

-- severity name -> maximum number of events to keep
local max_events = {}
for i = 1, #severity_keys do
    local key = severity_keys[i]
    -- HGET yields false (not nil) inside a Redis script when the field is absent.
    local raw_limit = redis.call('HGET', key, 'max_events')
    local limit = raw_limit and tonumber(raw_limit)
    -- Non-numeric and negative limits are ignored: a negative limit would make
    -- the elimination loop below run past the end of the event list (or sort a
    -- nil list when no event of that severity exists) and abort the script.
    if limit and limit >= 0 then
        max_events[string.sub(key, severity_offset)] = limit
    end
end

if next(max_events) == nil then
    return result
end

redis.call('SELECT', state_db)

local event_keys = redis.call('KEYS', event_table .. '*')
if #event_keys == 0 then
    return result
end

-- severity name -> list of event keys; only severities that have a limit
local events = {}
for i = 1, #event_keys do
    local event_key = event_keys[i]
    local severity = redis.call('HGET', event_key, 'severity')
    if severity and max_events[severity] ~= nil then
        local bucket = events[severity]
        if bucket == nil then
            bucket = {}
            events[severity] = bucket
        end
        bucket[#bucket + 1] = event_key
    end
end

for severity, limit in pairs(max_events) do
    local received = events[severity] or {}
    local number_received_events = #received
    if number_received_events > limit then
        -- Lexical order of the key names decides which events go first.
        -- NOTE(review): presumably the key embeds a timestamp so this removes
        -- the oldest events - confirm against the code that writes them.
        table.sort(received)
        local number_to_eliminate = number_received_events - limit
        for i = 1, number_to_eliminate do
            redis.call('DEL', received[i])
        end
        table.insert(result, severity .. " events: maximum " .. limit .. ", received " .. number_received_events .. ", eliminated " .. number_to_eliminate)
    else
        table.insert(result, severity .. " events: maximum " .. limit .. ", received " .. number_received_events .. ", not exceeding the maximum")
    end
end

return result

orchagent/notifications.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@ extern "C" {
44

55
#include "logger.h"
66
#include "notifications.h"
7+
#include "switchorch.h"
8+
9+
extern SwitchOrch *gSwitchOrch;
710

811
#ifdef ASAN_ENABLED
912
#include <sanitizer/lsan_interface.h>
@@ -40,6 +43,12 @@ void on_switch_shutdown_request(sai_object_id_t switch_id)
4043
/* TODO: Later a better restart story will be told here */
4144
SWSS_LOG_ERROR("Syncd stopped");
4245

46+
if (gSwitchOrch->isFatalEventReceived())
47+
{
48+
SWSS_LOG_ERROR("Orchagent aborted due to fatal SAI error received");
49+
abort();
50+
}
51+
4352
/*
4453
The quick_exit() is used instead of the exit() to avoid a following data race:
4554
* the exit() calls the destructors for global static variables (e.g.BufferOrch::m_buffer_type_maps)
@@ -59,3 +68,18 @@ void on_port_host_tx_ready(sai_object_id_t switch_id, sai_object_id_t port_id, s
5968
// don't use this event handler, because it runs by libsairedis in a separate thread
6069
// which causes concurrency access to the DB
6170
}
71+
72+
// Notification entry point for ASIC/SDK health events.
//
// Passes every argument, unchanged and in the same order, to
// SwitchOrch::onSwitchAsicSdkHealthEvent() on the global gSwitchOrch instance;
// no processing happens here.
//
// NOTE(review): gSwitchOrch is dereferenced without a null check, so this
// presumably must not fire before the SwitchOrch instance exists - confirm
// against the place where the handler is registered.
void on_switch_asic_sdk_health_event(sai_object_id_t switch_id,
                                     sai_switch_asic_sdk_health_severity_t severity,
                                     sai_timespec_t timestamp,
                                     sai_switch_asic_sdk_health_category_t category,
                                     sai_switch_health_data_t data,
                                     const sai_u8_list_t description)
{
    gSwitchOrch->onSwitchAsicSdkHealthEvent(
        switch_id, severity, timestamp, category, data, description);
}

orchagent/notifications.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,12 @@ void on_twamp_session_event(uint32_t count, sai_twamp_session_event_notification
1212
// The function prototype information can be found here:
1313
// https://github.com/sonic-net/sonic-sairedis/blob/master/meta/NotificationSwitchShutdownRequest.cpp#L49
1414
void on_switch_shutdown_request(sai_object_id_t switch_id);
15+
1516
void on_port_host_tx_ready(sai_object_id_t switch_id, sai_object_id_t port_id, sai_port_host_tx_ready_status_t m_portHostTxReadyStatus);
17+
18+
void on_switch_asic_sdk_health_event(sai_object_id_t switch_id,
19+
sai_switch_asic_sdk_health_severity_t severity,
20+
sai_timespec_t timestamp,
21+
sai_switch_asic_sdk_health_category_t category,
22+
sai_switch_health_data_t data,
23+
const sai_u8_list_t description);

orchagent/orchdaemon.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,10 +117,12 @@ bool OrchDaemon::init()
117117
TableConnector app_switch_table(m_applDb, APP_SWITCH_TABLE_NAME);
118118
TableConnector conf_asic_sensors(m_configDb, CFG_ASIC_SENSORS_TABLE_NAME);
119119
TableConnector conf_switch_hash(m_configDb, CFG_SWITCH_HASH_TABLE_NAME);
120+
TableConnector conf_suppress_asic_sdk_health_categories(m_configDb, CFG_SUPPRESS_ASIC_SDK_HEALTH_EVENT_NAME);
120121

121122
vector<TableConnector> switch_tables = {
122123
conf_switch_hash,
123124
conf_asic_sensors,
125+
conf_suppress_asic_sdk_health_categories,
124126
app_switch_table
125127
};
126128

orchagent/p4orch/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ p4orch_tests_SOURCES = $(ORCHAGENT_DIR)/orch.cpp \
3535
$(ORCHAGENT_DIR)/flex_counter/flow_counter_handler.cpp \
3636
$(ORCHAGENT_DIR)/port/port_capabilities.cpp \
3737
$(ORCHAGENT_DIR)/port/porthlpr.cpp \
38+
$(ORCHAGENT_DIR)/notifications.cpp \
3839
$(P4ORCH_DIR)/p4oidmapper.cpp \
3940
$(P4ORCH_DIR)/p4orch.cpp \
4041
$(P4ORCH_DIR)/p4orch_util.cpp \

orchagent/p4orch/tests/test_main.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ sai_object_id_t kMirrorSessionOid1 = 9001;
3535
char *gMirrorSession2 = "mirror-session-2";
3636
sai_object_id_t kMirrorSessionOid2 = 9002;
3737
sai_object_id_t gUnderlayIfId;
38+
string gMyAsicName = "";
39+
event_handle_t g_events_handle;
3840

3941
#define DEFAULT_BATCH_SIZE 128
4042
#define DEFAULT_MAX_BULK_SIZE 1000

0 commit comments

Comments
 (0)