diff --git a/dockers/docker-sonic-telemetry/Dockerfile.j2 b/dockers/docker-sonic-telemetry/Dockerfile.j2 index e94441b4f06..7bc70b0fb8f 100644 --- a/dockers/docker-sonic-telemetry/Dockerfile.j2 +++ b/dockers/docker-sonic-telemetry/Dockerfile.j2 @@ -28,7 +28,7 @@ RUN apt-get clean -y && \ COPY ["start.sh", "telemetry.sh", "dialout.sh", "/usr/bin/"] COPY ["supervisord.conf", "/etc/supervisor/conf.d/"] -COPY ["files/supervisor-proc-exit-listener", "/usr/bin"] +COPY ["supervisor-proc-exit-listener", "/usr/bin"] COPY ["critical_processes", "/etc/supervisor"] ENTRYPOINT ["/usr/bin/supervisord"] diff --git a/dockers/docker-sonic-telemetry/supervisor-proc-exit-listener b/dockers/docker-sonic-telemetry/supervisor-proc-exit-listener new file mode 100755 index 00000000000..42fb7ed2246 --- /dev/null +++ b/dockers/docker-sonic-telemetry/supervisor-proc-exit-listener @@ -0,0 +1,59 @@ +#!/usr/bin/env python + +import os +import signal +import sys +import syslog +import swsssdk + +from supervisor import childutils + +CONTAINER_NAME = 'telemetry' + +# Contents of file should be the names of critical processes (as defined in +# supervisor.conf file), one per line +CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes' + +# This table in database contains the features for container and each +# feature in a row will be configured a state or number. +CONTAINER_FEATURE_TABLE_NAME = 'CONTAINER_FEATURE' + +def main(): + # Read the list of critical processes from a file + with open(CRITICAL_PROCESSES_FILE, 'r') as f: + critical_processes = [line.rstrip('\n') for line in f] + + while True: + # Transition from ACKNOWLEDGED to READY + childutils.listener.ready() + + line = sys.stdin.readline() + headers = childutils.get_headers(line) + payload = sys.stdin.read(int(headers['len'])) + + # Transition from READY to ACKNOWLEDGED + childutils.listener.ok() + + # We only care about PROCESS_STATE_EXITED events + if headers['eventname'] == 'PROCESS_STATE_EXITED': + payload_headers, payload_data = childutils.eventdata(payload + '\n') + + expected = int(payload_headers['expected']) + processname = payload_headers['processname'] + groupname = payload_headers['groupname'] + + config_db = swsssdk.ConfigDBConnector() + config_db.connect() + docker_config = config_db.get_table(CONTAINER_FEATURE_TABLE_NAME) + if docker_config and docker_config.has_key(CONTAINER_NAME): + restart_feature = docker_config[CONTAINER_NAME].get('auto_restart') + + # If auto-restart feature is enabled and a critical process exited unexpectedly, terminate supervisor + if restart_feature == 'enabled' and expected == 0 and (processname in critical_processes or groupname in critical_processes): + MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." + msg = MSG_FORMAT_STR.format(payload_headers['processname']) + syslog.syslog(syslog.LOG_INFO, msg) + os.kill(os.getppid(), signal.SIGTERM) + +if __name__ == "__main__": + main() diff --git a/rules/docker-telemetry.mk b/rules/docker-telemetry.mk index 36113630633..157b95ad139 100644 --- a/rules/docker-telemetry.mk +++ b/rules/docker-telemetry.mk @@ -29,5 +29,4 @@ $(DOCKER_TELEMETRY)_RUN_OPT += --net=host --privileged -t $(DOCKER_TELEMETRY)_RUN_OPT += -v /etc/sonic:/etc/sonic:ro $(DOCKER_TELEMETRY)_RUN_OPT += --mount type=bind,source="/var/platform/",target="/mnt/platform/" -$(DOCKER_TELEMETRY)_FILES += $(SUPERVISOR_PROC_EXIT_LISTENER_SCRIPT) $(DOCKER_TELEMETRY)_BASE_IMAGE_FILES += monit_telemetry:/etc/monit/conf.d