From b1a6adf43fd5b4bc2e40d38f3caeadb0682bb3a1 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Sat, 31 Aug 2019 03:07:28 +0300 Subject: [PATCH 1/6] [syncd.sh] stop pmon ahead of syncd in flows except warm reboot --- files/scripts/syncd.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh index 0c3be805e7d..80bbc4eebee 100755 --- a/files/scripts/syncd.sh +++ b/files/scripts/syncd.sh @@ -103,11 +103,21 @@ start() { if [[ x"$WARM_BOOT" != x"true" ]]; then if [[ x"$(/bin/systemctl is-active pmon)" == x"active" ]]; then + # The only possibility that pmon is active, is that pmon starts ahead of syncd, + # Thus, pmon has been started and stopped for an extra time which is unnecessary + # and consume more starting time. + # In this sense, should we add explicit dependency of syncd for pmon + # and remove the below "stop pmon"? /bin/systemctl stop pmon /usr/bin/hw-management.sh chipdown /bin/systemctl restart pmon + debug "Assertion failure, pmon is active while syncd starting..." else /usr/bin/hw-management.sh chipdown + debug "Triger pmon starting" + debug "Starting pmon service..." + /bin/systemctl restart pmon + debug "Started pmon service" fi fi @@ -150,6 +160,12 @@ stop() { TYPE=cold fi + if [[ x$sonic_asic_platform == x"mellanox" ]] && [[ x$TYPE == x"cold" ]]; then + debug "Stopping pmon service ahead of syncd..." + /bin/systemctl stop pmon + debug "Stopped pmon service." + fi + if [[ x$sonic_asic_platform != x"mellanox" ]] || [[ x$TYPE != x"cold" ]]; then debug "${TYPE} shutdown syncd process ..." /usr/bin/docker exec -i syncd /usr/bin/syncd_request_shutdown --${TYPE} From 7517d5ed10529cd2d74df1d7246698d4aa9ad007 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Thu, 5 Sep 2019 04:03:27 +0300 Subject: [PATCH 2/6] rephase --- files/scripts/syncd.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh index 80bbc4eebee..77bf482a9b6 100755 --- a/files/scripts/syncd.sh +++ b/files/scripts/syncd.sh @@ -106,12 +106,11 @@ start() { # The only possibility that pmon is active, is that pmon starts ahead of syncd, # Thus, pmon has been started and stopped for an extra time which is unnecessary # and consume more starting time. - # In this sense, should we add explicit dependency of syncd for pmon - # and remove the below "stop pmon"? + # In this sense, should we add explicit dependency on syncd for pmon + # and then remove the below "stop pmon"? /bin/systemctl stop pmon /usr/bin/hw-management.sh chipdown /bin/systemctl restart pmon - debug "Assertion failure, pmon is active while syncd starting..." else /usr/bin/hw-management.sh chipdown debug "Triger pmon starting" From 54686462d7d0d06d071e39bda9a350833f91af45 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Mon, 9 Sep 2019 17:26:54 +0300 Subject: [PATCH 3/6] [syncd.sh] extract common code out of the if/else/fi --- files/scripts/syncd.sh | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh index 77bf482a9b6..cc9cf5ce3b3 100755 --- a/files/scripts/syncd.sh +++ b/files/scripts/syncd.sh @@ -109,15 +109,12 @@ start() { # In this sense, should we add explicit dependency on syncd for pmon # and then remove the below "stop pmon"? /bin/systemctl stop pmon - /usr/bin/hw-management.sh chipdown - /bin/systemctl restart pmon - else - /usr/bin/hw-management.sh chipdown - debug "Triger pmon starting" - debug "Starting pmon service..." - /bin/systemctl restart pmon - debug "Started pmon service" + debug "pmon is active while syncd starting, stop it first" fi + /usr/bin/hw-management.sh chipdown + debug "Starting pmon service..." + /bin/systemctl restart pmon + debug "Started pmon service" fi if [[ x"$BOOT_TYPE" == x"fast" ]]; then From cce1785519baf2b687e84faf48b0ed16fe772791 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Tue, 10 Sep 2019 08:31:02 +0300 Subject: [PATCH 4/6] [syncd.sh] remove unused comments --- files/scripts/syncd.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh index cc9cf5ce3b3..dd00f507517 100755 --- a/files/scripts/syncd.sh +++ b/files/scripts/syncd.sh @@ -103,11 +103,6 @@ start() { if [[ x"$WARM_BOOT" != x"true" ]]; then if [[ x"$(/bin/systemctl is-active pmon)" == x"active" ]]; then - # The only possibility that pmon is active, is that pmon starts ahead of syncd, - # Thus, pmon has been started and stopped for an extra time which is unnecessary - # and consume more starting time. - # In this sense, should we add explicit dependency on syncd for pmon - # and then remove the below "stop pmon"? /bin/systemctl stop pmon debug "pmon is active while syncd starting, stop it first" fi From 226039c30bd7bd960a00b3b6ce2fb808823f40d7 Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Thu, 12 Sep 2019 16:37:52 +0300 Subject: [PATCH 5/6] [syncd.sh] move "start pmon" out of the "warm reboot != true" conditional branch to fix the issue that pmon not started after warm reboot --- files/scripts/syncd.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh index dd00f507517..e282731980d 100755 --- a/files/scripts/syncd.sh +++ b/files/scripts/syncd.sh @@ -107,11 +107,12 @@ start() { debug "pmon is active while syncd starting, stop it first" fi /usr/bin/hw-management.sh chipdown - debug "Starting pmon service..." - /bin/systemctl restart pmon - debug "Started pmon service" fi + debug "Starting pmon service..." + /bin/systemctl start pmon + debug "Started pmon service" + if [[ x"$BOOT_TYPE" == x"fast" ]]; then /usr/bin/hw-management.sh chipupdis fi From 62509b6397f8e46b01c154b4cde8bb534bc95cda Mon Sep 17 00:00:00 2001 From: Stephen Sun Date: Fri, 13 Sep 2019 08:22:58 +0300 Subject: [PATCH 6/6] [syncd.sh,pmon.service] Prevent pmon from starting ahead of syncd During system starting, pmon isn't supposed to start ahead of syncd starting in order to avoid racing condition between syncd and pmon. Currently it is done by killing pmon if is alive when syncd is starting. However such implementation is still risky. Consider the following flow: 1. pmon is inactive when syncd.sh is checking. but syncd.sh is scheduled out somehow just ahead of "chipdown" called 2. systemd is switched in and starts pmon service 3. at this point, pmon and syncd are running simultaneously, critical section broken and racing condition formed To prevent that issue, ony solution is to add syncd as "After" in pmon.service, which ensure that whenever pmon starts syncd has been started. However, dong so requires to defer starting pmon.service after syncd.service has fully started otherwise a deadlock is formed as following: 1. syncd.sh starts pmon ahead of itself fully started, while 2. pmon not being able to start due to syncd, one of its "After", not fully started. 3. as a result, syncd and pmon have to wait for each other forever To solve that, move starting pmon.service to "wait()" so that pmon is started after syncd fully started, breaking the deadlock. --- files/build_templates/pmon.service.j2 | 3 +++ files/scripts/syncd.sh | 9 +++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/files/build_templates/pmon.service.j2 b/files/build_templates/pmon.service.j2 index 33f3173b488..9c422625681 100644 --- a/files/build_templates/pmon.service.j2 +++ b/files/build_templates/pmon.service.j2 @@ -2,6 +2,9 @@ Description=Platform monitor container Requires=updategraph.service After=updategraph.service +{% if sonic_asic_platform == 'mellanox' %} +After=syncd.service +{% endif %} Before=ntp-config.service [Service] diff --git a/files/scripts/syncd.sh b/files/scripts/syncd.sh index e282731980d..c9064657321 100755 --- a/files/scripts/syncd.sh +++ b/files/scripts/syncd.sh @@ -109,10 +109,6 @@ start() { /usr/bin/hw-management.sh chipdown fi - debug "Starting pmon service..." - /bin/systemctl start pmon - debug "Started pmon service" - if [[ x"$BOOT_TYPE" == x"fast" ]]; then /usr/bin/hw-management.sh chipupdis fi @@ -136,6 +132,11 @@ start() { } wait() { + if [[ x"$sonic_asic_platform" == x"mellanox" ]]; then + debug "Starting pmon service..." + /bin/systemctl start pmon + debug "Started pmon service" + fi /usr/bin/${SERVICE}.sh wait }