Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions files/build_templates/dhcp_relay_regex.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,10 @@
"tag": "dhcp-relay-discard",
"regex": "Discarding packet received on ([a-zA-Z0-9-_]*) interface that has no IPv4 address assigned.",
"params": [ "ifname" ]
},
{
"tag": "dhcp-relay-bind-failure",
"regex": "Failed to bind socket to (link local|global) ipv6 address on interface ([a-zA-Z0-9]*)",
"params": [ "type:ret=(arg==\"link local\")and\"local\"or\"global\"", "vlan" ]
}
]
14 changes: 14 additions & 0 deletions files/image_config/monit/memory_checker
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@ import re

import docker

from swsscommon import swsscommon

EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
EVENTS_PUBLISHER_TAG = "mem-threshold-exceeded"

def get_command_result(command):
"""Executes the command and return the resulting output.
Expand Down Expand Up @@ -54,6 +58,14 @@ def get_command_result(command):

return command_stdout.strip()

def publish_events(container_name, mem_usage_bytes, threshold_value):
    """Publishes a memory-threshold-exceeded event for a container.

    Opens an events publisher for EVENTS_PUBLISHER_SOURCE, publishes one event
    tagged EVENTS_PUBLISHER_TAG and then releases the publisher.

    Args:
        container_name: Name of the container; sent as the "ctr_name" field.
        mem_usage_bytes: Current memory usage; sent as the "mem_usage" field.
        threshold_value: Memory threshold; sent as the "threshold" field.

    Returns:
        None.
    """
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        params["ctr_name"] = container_name
        # NOTE(review): FieldValueMap is presumably a string-to-string map, so
        # numeric inputs are stringified here; str() is a no-op for values
        # that are already strings. Confirm against the swsscommon binding.
        params["mem_usage"] = str(mem_usage_bytes)
        params["threshold"] = str(threshold_value)
        swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Release the publisher even when building or publishing the event
        # raises, so the handle is never leaked.
        swsscommon.events_deinit_publisher(events_handle)

def check_memory_usage(container_name, threshold_value):
"""Checks the memory usage of a container and writes an alerting messages into
Expand Down Expand Up @@ -89,6 +101,8 @@ def check_memory_usage(container_name, threshold_value):
.format(container_name, mem_usage_bytes, threshold_value))
syslog.syslog(syslog.LOG_INFO, "[{}]: Memory usage ({} Bytes) is larger than the threshold ({} Bytes)!"
.format(container_name, mem_usage_bytes, threshold_value))
# publish event
publish_events(container_name, mem_usage_bytes, threshold_value)
sys.exit(3)
else:
syslog.syslog(syslog.LOG_ERR, "[memory_checker] Failed to retrieve memory value from '{}'"
Expand Down
10 changes: 10 additions & 0 deletions files/scripts/supervisor-proc-exit-listener
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ SELECT_TIMEOUT_SECS = 1.0
# Alerting message will be written into syslog in the following interval
ALERTING_INTERVAL_SECS = 60

EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
EVENTS_PUBLISHER_TAG = "process-exited-unexpectedly"

def get_critical_group_and_process_list():
"""
Expand Down Expand Up @@ -106,6 +108,13 @@ def get_autorestart_state(container_name):

return is_auto_restart

def publish_events(process_name, container_name):
    """Publishes a process-exited event for a process in a container.

    Opens an events publisher for EVENTS_PUBLISHER_SOURCE, publishes one event
    tagged EVENTS_PUBLISHER_TAG and then releases the publisher.

    Args:
        process_name: Name of the exited process; sent as "process_name".
        container_name: Name of the container; sent as "ctr_name".

    Returns:
        None.
    """
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        params["process_name"] = process_name
        params["ctr_name"] = container_name
        swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Release the publisher even when publishing raises, so the handle is
        # never leaked; the exception still propagates to the caller.
        swsscommon.events_deinit_publisher(events_handle)

def main(argv):
container_name = None
Expand Down Expand Up @@ -145,6 +154,7 @@ def main(argv):
MSG_FORMAT_STR = "Process '{}' exited unexpectedly. Terminating supervisor '{}'"
msg = MSG_FORMAT_STR.format(payload_headers['processname'], container_name)
syslog.syslog(syslog.LOG_INFO, msg)
publish_events(payload_headers['processname'], container_name)
os.kill(os.getppid(), signal.SIGTERM)
else:
process_under_alerting[process_name]["last_alerted"] = time.time()
Expand Down
12 changes: 12 additions & 0 deletions src/system-health/health_checker/service_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
SYSLOG_IDENTIFIER = 'service_checker'
logger = Logger(log_identifier=SYSLOG_IDENTIFIER)

EVENTS_PUBLISHER_SOURCE = "sonic-events-host"
EVENTS_PUBLISHER_TAG = "process-not-running"

class ServiceChecker(HealthChecker):
"""
Expand Down Expand Up @@ -308,6 +310,15 @@ def _parse_supervisorctl_status(self, process_status):
data[items[0].strip()] = items[1].strip()
return data

def publish_events(self, container_name, critical_process_list):
    """Publishes one process-not-running event per critical process.

    Opens an events publisher for EVENTS_PUBLISHER_SOURCE, publishes an event
    tagged EVENTS_PUBLISHER_TAG for every entry of critical_process_list and
    then releases the publisher.

    Args:
        container_name: Name of the container; sent as "ctr_name".
        critical_process_list: Iterable of process names; each one is sent as
            "process_name" in its own event.

    Returns:
        None.
    """
    events_handle = swsscommon.events_init_publisher(EVENTS_PUBLISHER_SOURCE)
    try:
        params = swsscommon.FieldValueMap()
        params["ctr_name"] = container_name
        for process_name in critical_process_list:
            # The same map is reused; only the process name changes per event.
            params["process_name"] = process_name
            swsscommon.event_publish(events_handle, EVENTS_PUBLISHER_TAG, params)
    finally:
        # Release the publisher even when a publish in the loop raises, so the
        # handle is never leaked; the exception still propagates.
        swsscommon.events_deinit_publisher(events_handle)

def check_process_existence(self, container_name, critical_process_list, config, feature_table):
"""Check whether the process in the specified container is running or not.

Expand All @@ -332,6 +343,7 @@ def check_process_existence(self, container_name, critical_process_list, config,
if process_status is None:
for process_name in critical_process_list:
self.set_object_not_ok('Process', '{}:{}'.format(container_name, process_name), "'{}' is not running".format(process_name))
publish_events(container_name, critical_process_list)
return

process_status = self._parse_supervisorctl_status(process_status.strip().splitlines())
Expand Down