From 08c807f76e8a4a124f41db065963b2c9f1d5edd8 Mon Sep 17 00:00:00 2001 From: Prasanth Kunjum Veettil Date: Wed, 25 Sep 2019 20:52:18 -0700 Subject: [PATCH] System recovery when syncd crashes When syncd is terminated unexpectedly, the system goes into an unstable state. The system stays in that state unless a reboot is triggered by the user. It does not recover by itself. This is the problem addressed by this change. When the syncd process crashes, we cannot restart syncd alone because it has a dependency on orch-agent. So the same mechanism that is used when a SAI API call fails is used here. The syncd process state is monitored, and when it crashes a shutdown notification is sent to orch-agent, which eventually results in an SWSS restart and a syncd restart to recover the system. --- files/scripts/supervisor-proc-exit-listener | 3 ++ .../broadcom/docker-syncd-brcm/Dockerfile.j2 | 3 +- .../docker-syncd-brcm/critical_processes | 1 + .../broadcom/docker-syncd-brcm/custom_handler | 31 +++++++++++++++++++ .../docker-syncd-brcm/supervisord.conf | 6 ++++ 5 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 platform/broadcom/docker-syncd-brcm/critical_processes create mode 100644 platform/broadcom/docker-syncd-brcm/custom_handler diff --git a/files/scripts/supervisor-proc-exit-listener b/files/scripts/supervisor-proc-exit-listener index 6bc62fc400c..7ad1bbfd52f 100755 --- a/files/scripts/supervisor-proc-exit-listener +++ b/files/scripts/supervisor-proc-exit-listener @@ -10,6 +10,7 @@ from supervisor import childutils # Contents of file should be the names of critical processes (as defined in # supervisor.conf file), one per line CRITICAL_PROCESSES_FILE = '/etc/supervisor/critical_processes' +CUSTOM_HANDLER = '/usr/bin/custom_handler' def main(): # Read the list of critical processes from a file @@ -39,6 +40,8 @@ def main(): MSG_FORMAT_STR = "Process {} exited unxepectedly. Terminating supervisor..." 
msg = MSG_FORMAT_STR.format(payload_headers['processname']) syslog.syslog(syslog.LOG_INFO, msg) + if os.path.exists(CUSTOM_HANDLER): + os.system(CUSTOM_HANDLER) os.kill(os.getppid(), signal.SIGTERM) if __name__ == "__main__": diff --git a/platform/broadcom/docker-syncd-brcm/Dockerfile.j2 b/platform/broadcom/docker-syncd-brcm/Dockerfile.j2 index 328f698fdb6..b0700d3c809 100755 --- a/platform/broadcom/docker-syncd-brcm/Dockerfile.j2 +++ b/platform/broadcom/docker-syncd-brcm/Dockerfile.j2 @@ -22,10 +22,11 @@ debs/{{ deb }}{{' '}} ## TODO: add kmod into Depends RUN apt-get install -yf kmod -COPY ["files/dsserve", "files/bcmcmd", "start.sh", "bcmsh", "/usr/bin/"] +COPY ["files/dsserve", "files/bcmcmd", "start.sh", "bcmsh", "files/supervisor-proc-exit-listener", "custom_handler", "/usr/bin/"] RUN chmod +x /usr/bin/dsserve /usr/bin/bcmcmd COPY ["supervisord.conf", "/etc/supervisor/conf.d/"] +COPY ["critical_processes", "/etc/supervisor/"] ## Clean up RUN apt-get clean -y; apt-get autoclean -y; apt-get autoremove -y diff --git a/platform/broadcom/docker-syncd-brcm/critical_processes b/platform/broadcom/docker-syncd-brcm/critical_processes new file mode 100644 index 00000000000..6082f242b87 --- /dev/null +++ b/platform/broadcom/docker-syncd-brcm/critical_processes @@ -0,0 +1 @@ +syncd diff --git a/platform/broadcom/docker-syncd-brcm/custom_handler b/platform/broadcom/docker-syncd-brcm/custom_handler new file mode 100644 index 00000000000..822f8804da7 --- /dev/null +++ b/platform/broadcom/docker-syncd-brcm/custom_handler @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# This is a custom handler for supervisor-proc-exit-listener. +# Docker-specific handling when critical processes exit can be added here. 
+import os +import sys +import syslog +import json + +from swsssdk import SonicV2Connector + +class OaNotif(object): + + def __init__(self): + super(OaNotif,self).__init__() + self.db = SonicV2Connector(host="127.0.0.1") + self.db.connect(self.db.ASIC_DB, False) + return + + def shutdown(self): + opdata = ["switch_shutdown_request",""] + msg = json.dumps(opdata,separators=(',',':')) + self.db.publish('ASIC_DB','NOTIFICATIONS', msg) + return + +def main(): + orch_agent_notif = OaNotif() + syslog.syslog(syslog.LOG_INFO, "Restarting swss to recover the system") + orch_agent_notif.shutdown() + +if __name__ == "__main__": + main() diff --git a/platform/broadcom/docker-syncd-brcm/supervisord.conf b/platform/broadcom/docker-syncd-brcm/supervisord.conf index a2e0743b1cf..e366650244a 100644 --- a/platform/broadcom/docker-syncd-brcm/supervisord.conf +++ b/platform/broadcom/docker-syncd-brcm/supervisord.conf @@ -11,6 +11,12 @@ autorestart=false stdout_logfile=syslog stderr_logfile=syslog +[eventlistener:supervisor-proc-exit-listener] +command=/usr/bin/supervisor-proc-exit-listener +events=PROCESS_STATE_EXITED +autostart=true +autorestart=unexpected + [program:rsyslogd] command=/usr/sbin/rsyslogd -n priority=2