From 9c7db7b74bdc34e312284512b6f955d04a444e9b Mon Sep 17 00:00:00 2001 From: kcudnik Date: Sun, 25 Jul 2021 18:38:30 +0200 Subject: [PATCH 1/4] [syncd] Add event name to timer watchdog --- syncd/Syncd.cpp | 29 +++++++++++++++++------------ syncd/Syncd.h | 3 +++ syncd/TimerWatchdog.cpp | 39 +++++++++++++++++++++++++++++++-------- syncd/TimerWatchdog.h | 8 ++++++++ syncd/WatchdogScope.cpp | 22 ++++++++++++++++++++++ syncd/WatchdogScope.h | 21 +++++++++++++++++++++ 6 files changed, 102 insertions(+), 20 deletions(-) create mode 100644 syncd/WatchdogScope.cpp create mode 100644 syncd/WatchdogScope.h diff --git a/syncd/Syncd.cpp b/syncd/Syncd.cpp index d3332eab44..f9cd2f9bfd 100644 --- a/syncd/Syncd.cpp +++ b/syncd/Syncd.cpp @@ -14,7 +14,7 @@ #include "RedisSelectableChannel.h" #include "ZeroMQSelectableChannel.h" #include "PerformanceIntervalTimer.h" -#include "TimerWatchdog.h" +#include "WatchdogScope.h" #include "sairediscommon.h" @@ -50,7 +50,8 @@ Syncd::Syncd( m_asicInitViewMode(false), // by default we are in APPLY view mode m_vendorSai(vendorSai), m_veryFirstRun(false), - m_enableSyncMode(false) + m_enableSyncMode(false), + m_timerWatchdog(10 * 1000000) // watch for executions over 30 seconds { SWSS_LOG_ENTER(); @@ -317,6 +318,8 @@ sai_status_t Syncd::processSingleEvent( return SAI_STATUS_SUCCESS; } + WatchdogScope ws(m_timerWatchdog, op + ":" + key); + if (op == REDIS_ASIC_STATE_COMMAND_CREATE) return processQuadEvent(SAI_COMMON_API_CREATE, kco); @@ -1950,6 +1953,8 @@ void Syncd::processFlexCounterGroupEvent( // TODO must be moved to go via ASIC c auto& op = kfvOp(kco); auto& values = kfvFieldsValues(kco); + WatchdogScope ws(m_timerWatchdog, op + ":" + groupName); + if (op == SET_COMMAND) { m_manager->addCounterPlugin(groupName, values); @@ -1978,6 +1983,8 @@ void Syncd::processFlexCounterEvent( // TODO must be moved to go via ASIC channe auto& key = kfvKey(kco); auto& op = kfvOp(kco); + WatchdogScope ws(m_timerWatchdog, op + ":" + key); + auto delimiter = key.find_first_of(":"); if (delimiter == std::string::npos) @@ -4366,11 +4373,9 @@ void Syncd::run() runMainLoop = false; } - TimerWatchdog twd(30 * 1000000); // watch for executions over 30 seconds + m_timerWatchdog.setCallback(timerWatchdogCallback); - twd.setCallback(timerWatchdogCallback); - - while(runMainLoop) + while (runMainLoop) { try { @@ -4378,8 +4383,6 @@ void Syncd::run() int result = s->select(&sel); - twd.setStartTime(); - if (sel == m_restartQuery.get()) { /* @@ -4399,6 +4402,8 @@ void Syncd::run() SWSS_LOG_NOTICE("drained queue"); + WatchdogScope ws(m_timerWatchdog, "restart query"); + shutdownType = handleRestartQuery(*m_restartQuery); if (shutdownType != SYNCD_RESTART_TYPE_PRE_SHUTDOWN) @@ -4468,8 +4473,6 @@ void Syncd::run() { SWSS_LOG_ERROR("select failed: %d", result); } - - twd.setEndTime(); } catch(const std::exception &e) { @@ -4486,11 +4489,11 @@ void Syncd::run() // make sure that if second exception will arise, then we break the loop m_commandLineOptions->m_disableExitSleep = true; - - twd.setEndTime(); } } + WatchdogScope ws(m_timerWatchdog, "shutting down syncd"); + if (shutdownType == SYNCD_RESTART_TYPE_WARM) { const char *warmBootWriteFile = profileGetValue(0, SAI_KEY_WARM_BOOT_WRITE_FILE); @@ -4566,6 +4569,8 @@ syncd_restart_type_t Syncd::handleRestartQuery( restartQuery.pop(op, data, values); + m_timerWatchdog.setEventName(op + ":" + data); + SWSS_LOG_NOTICE("received %s switch shutdown event", op.c_str()); return RequestShutdownCommandLineOptions::stringToRestartType(op); diff --git a/syncd/Syncd.h b/syncd/Syncd.h index dc3c5b9962..4cea8fac5a 100644 --- a/syncd/Syncd.h +++ b/syncd/Syncd.h @@ -17,6 +17,7 @@ #include "BreakConfig.h" #include "NotificationProducerBase.h" #include "SelectableChannel.h" +#include "TimerWatchdog.h" #include "meta/SaiAttributeList.h" @@ -489,5 +490,7 @@ namespace syncd std::shared_ptr m_contextConfig; std::shared_ptr m_breakConfig; + + TimerWatchdog m_timerWatchdog; }; } diff --git a/syncd/TimerWatchdog.cpp b/syncd/TimerWatchdog.cpp index c51588b930..6f50ca2a32 100644 --- a/syncd/TimerWatchdog.cpp +++ b/syncd/TimerWatchdog.cpp @@ -4,6 +4,8 @@ #include +#define MUTEX std::lock_guard _lock(m_mutex); + using namespace syncd; TimerWatchdog::TimerWatchdog( @@ -35,6 +37,7 @@ TimerWatchdog::~TimerWatchdog() void TimerWatchdog::setStartTime() { + MUTEX; SWSS_LOG_ENTER(); do @@ -42,11 +45,11 @@ void TimerWatchdog::setStartTime() m_startTimestamp = getTimeSinceEpoch(); } while (m_startTimestamp <= m_endTimestamp); // make sure new start time is always past last end time - } void TimerWatchdog::setEndTime() { + MUTEX; SWSS_LOG_ENTER(); do @@ -54,11 +57,34 @@ void TimerWatchdog::setEndTime() m_endTimestamp = getTimeSinceEpoch(); } while (m_endTimestamp <= m_startTimestamp); // make sure new end time is always past last start time + + auto span = m_endTimestamp - m_startTimestamp; + + if (span > m_warnTimespan) + { + SWSS_LOG_ERROR("event '%s' took %ld ms to execute", m_eventName.c_str(), span/1000); + } + else if (span > 50*1000) + { + SWSS_LOG_WARN("event '%s' took %ld ms to execute", m_eventName.c_str(), span/1000); + } + + m_eventName = ""; +} + +void TimerWatchdog::setEventName( + _In_ const std::string& eventName) +{ + MUTEX; + SWSS_LOG_ENTER(); + + m_eventName = eventName; } void TimerWatchdog::setCallback( _In_ std::function callback) { + MUTEX; SWSS_LOG_ENTER(); m_callback = callback; @@ -74,6 +100,8 @@ void TimerWatchdog::threadFunction() { std::this_thread::sleep_for(std::chrono::seconds(1)); + MUTEX; // to protect event name + // we make local copies, since executing functions can be so fast that // when we will read second time start timestamp it can be different // than previous one @@ -94,7 +122,7 @@ void TimerWatchdog::threadFunction() span = now - start; // this must be always non negative - SWSS_LOG_NOTICE("time span %ld ms", span/1000); + SWSS_LOG_NOTICE("time span %ld ms for '%s'", span/1000, m_eventName.c_str()); // TODO remove this if (span < 0) SWSS_LOG_THROW("negative span 'now - start': %ld - %ld", now, start); @@ -105,12 +133,7 @@ void TimerWatchdog::threadFunction() // function probably hanged - SWSS_LOG_WARN("timespan WD exceeded %ld ms", span/1000); - - auto callback = m_callback; - - if (callback) - callback(span); + SWSS_LOG_ERROR("time span WD exceeded %ld ms for %s", span/1000, m_eventName.c_str()); } continue; diff --git a/syncd/TimerWatchdog.h b/syncd/TimerWatchdog.h index 4599c247f5..1768da4ad2 100644 --- a/syncd/TimerWatchdog.h +++ b/syncd/TimerWatchdog.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace syncd { @@ -23,6 +24,9 @@ namespace syncd void setEndTime(); + void setEventName( + _In_ const std::string& eventName); + void setCallback( _In_ std::function callback); @@ -51,5 +55,9 @@ namespace syncd std::shared_ptr m_thread; std::function m_callback; + + std::string m_eventName; + + std::mutex m_mutex; }; } diff --git a/syncd/WatchdogScope.cpp b/syncd/WatchdogScope.cpp new file mode 100644 index 0000000000..1e16f77bff --- /dev/null +++ b/syncd/WatchdogScope.cpp @@ -0,0 +1,22 @@ +#include "WatchdogScope.h" + +using namespace syncd; + +WatchdogScope::WatchdogScope( + _In_ TimerWatchdog& tw, + _In_ const std::string eventName): + m_tw(tw) +{ + // SWSS_LOG_ENTER(); + + m_tw.setStartTime(); + + m_tw.setEventName(eventName); +} + +WatchdogScope::~WatchdogScope() +{ + // SWSS_LOG_ENTER(); + + m_tw.setEndTime(); +} diff --git a/syncd/WatchdogScope.h b/syncd/WatchdogScope.h new file mode 100644 index 0000000000..13c67d5d96 --- /dev/null +++ b/syncd/WatchdogScope.h @@ -0,0 +1,21 @@ +#pragma once + +#include "TimerWatchdog.h" + +namespace syncd +{ + class WatchdogScope + { + public: + + WatchdogScope( + _In_ TimerWatchdog& tw, + _In_ const std::string eventName); + + ~WatchdogScope(); + + private: + + TimerWatchdog& m_tw; + }; +} From e1bcde996ab6b49134b433ed142b17a0964e0a2d Mon Sep 17 00:00:00 2001 From: kcudnik Date: Sun, 25 Jul 2021 19:13:34 +0200 Subject: [PATCH 2/4] [syncd] Update Makefile --- syncd/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/syncd/Makefile.am b/syncd/Makefile.am index 1ae9315a35..3082d142b9 100644 --- a/syncd/Makefile.am +++ b/syncd/Makefile.am @@ -16,6 +16,7 @@ endif noinst_LIBRARIES = libSyncd.a libSyncdRequestShutdown.a libSyncd_a_SOURCES = \ + WatchdogScope.cpp \ SaiSwitchInterface.cpp \ ZeroMQSelectableChannel.cpp \ RedisSelectableChannel.cpp \ From 1784edbc4ad1bc2f28cb7433bdaf18c322855d87 Mon Sep 17 00:00:00 2001 From: kcudnik Date: Sun, 25 Jul 2021 19:17:28 +0200 Subject: [PATCH 3/4] [syncd] Disable log enter for performance --- syncd/WatchdogScope.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/syncd/WatchdogScope.cpp b/syncd/WatchdogScope.cpp index 1e16f77bff..2fa90bd504 100644 --- a/syncd/WatchdogScope.cpp +++ b/syncd/WatchdogScope.cpp @@ -7,7 +7,7 @@ WatchdogScope::WatchdogScope( _In_ const std::string eventName): m_tw(tw) { - // SWSS_LOG_ENTER(); + // SWSS_LOG_ENTER(); // disabled m_tw.setStartTime(); @@ -16,7 +16,7 @@ WatchdogScope::WatchdogScope( WatchdogScope::~WatchdogScope() { - // SWSS_LOG_ENTER(); + // SWSS_LOG_ENTER(); // disabled m_tw.setEndTime(); } From 478dd67177de18f42f66194294a311e4136208fd Mon Sep 17 00:00:00 2001 From: kcudnik Date: Sun, 25 Jul 2021 19:25:25 +0200 Subject: [PATCH 4/4] [syncd] Set watchdog timeout to 30 sec --- syncd/Syncd.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/syncd/Syncd.cpp b/syncd/Syncd.cpp index f9cd2f9bfd..06491e9c96 100644 --- a/syncd/Syncd.cpp +++ b/syncd/Syncd.cpp @@ -51,7 +51,7 @@ Syncd::Syncd( m_vendorSai(vendorSai), m_veryFirstRun(false), m_enableSyncMode(false), - m_timerWatchdog(10 * 1000000) // watch for executions over 30 seconds + m_timerWatchdog(30 * 1000000) // watch for executions over 30 seconds { SWSS_LOG_ENTER();