Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ neighsyncd/neighsyncd
portsyncd/portsyncd
orchagent/orchagent
orchagent/routeresync
orchagent/orchagent_restart_check
swssconfig/swssconfig
swssconfig/swssplayer
tests/tests
6 changes: 5 additions & 1 deletion orchagent/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ dist_swss_DATA = \
pfc_detect_barefoot.lua \
pfc_restore.lua

bin_PROGRAMS = orchagent routeresync
bin_PROGRAMS = orchagent routeresync orchagent_restart_check

if DEBUG
DBGFLAGS = -ggdb -DDEBUG
Expand Down Expand Up @@ -86,3 +86,7 @@ routeresync_SOURCES = routeresync.cpp
routeresync_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON)
routeresync_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON)
routeresync_LDADD = -lswsscommon

orchagent_restart_check_SOURCES = orchagent_restart_check.cpp
orchagent_restart_check_CPPFLAGS = $(DBGFLAGS) $(AM_CPPFLAGS) $(CFLAGS_COMMON)
orchagent_restart_check_LDADD = -lhiredis -lswsscommon -lpthread
145 changes: 145 additions & 0 deletions orchagent/orchagent_restart_check.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#include <iostream>
#include <sstream>

#include <unistd.h>
#include <getopt.h>

#include "notificationproducer.h"
#include "notificationconsumer.h"
#include "select.h"
#include "logger.h"


/*
 * Print the command line synopsis and a short description of every
 * supported option to stdout.
 */
void printUsage()
{
    SWSS_LOG_ENTER();

    // The synopsis lists every option described below, not just -s.
    std::cout << "Usage: orchagent_restart_check [-n] [-s] [-w waitTime] [-h]" << std::endl;
    std::cout << " -n --noFreeze" << std::endl;
    std::cout << " Don't freeze orchagent even if check succeeded" << std::endl;
    std::cout << " -s --skipPendingTaskCheck" << std::endl;
    std::cout << " Skip pending task dependency check for orchagent" << std::endl;
    std::cout << " -w --waitTime" << std::endl;
    std::cout << " Wait time for response from orchagent, in milliseconds. Default value: 1000" << std::endl;
    std::cout << " -h --help" << std::endl;
    std::cout << " Print out this message" << std::endl;
}


/*
 * Before stopping orchagent for warm restart, a basic state check is preferred to
 * ensure orchagent is not in a transient state, so that a deterministic state can be
 * restored after the restart.
 *
 * orchagent_restart_check sends a query to orchagent on the "RESTARTCHECK"
 * notification channel and waits for the answer on "RESTARTCHECKREPLY".
 * orchagent is expected to do a self-check, reply "READY" and freeze if everything
 * is ok, otherwise reply "NOT_READY".
 *
 * Options:
 *   -n, --noFreeze              orchagent won't freeze even if the check succeeded.
 *   -s, --skipPendingTaskCheck  orchagent won't use the existence of pending tasks
 *                               as a state check criterion.
 *   -w, --waitTime <ms>         how long to wait for the reply (default 1000 ms).
 *   -h, --help                  print the usage text and exit.
 *
 * Returns EXIT_SUCCESS only when a "READY" reply arrives within the wait time;
 * a "NOT_READY" reply, a timeout or a select error all yield EXIT_FAILURE.
 */
int main(int argc, char **argv)
{
    swss::Logger::getInstance().setMinPrio(swss::Logger::SWSS_INFO);
    SWSS_LOG_ENTER();

    // These flags travel to orchagent as strings. The receiving side only
    // compares them against "true", so any other value leaves the option off.
    std::string skipPendingTaskCheck = "false";
    std::string noFreeze = "false";
    /* Default wait time is 1000 millisecond */
    int waitTime = 1000;

    const char* const optstring = "nsw:h";
    // getopt_long() requires the table to end with an all-zero entry.
    static struct option long_options[] =
    {
        { "noFreeze",             no_argument,       0, 'n' },
        { "skipPendingTaskCheck", no_argument,       0, 's' },
        { "waitTime",             required_argument, 0, 'w' },
        { "help",                 no_argument,       0, 'h' },
        { 0,                      0,                 0,  0  }
    };

    while (true)
    {
        int option_index = 0;

        int c = getopt_long(argc, argv, optstring, long_options, &option_index);

        if (c == -1)
        {
            break;
        }

        switch (c)
        {
        case 'n':
            SWSS_LOG_NOTICE("Won't freeze orchagent even if check succeeded");
            noFreeze = "true";
            break;
        case 's':
            SWSS_LOG_NOTICE("Skipping pending task check for orchagent");
            skipPendingTaskCheck = "true";
            break;
        case 'w':
        {
            // Reject malformed input instead of silently treating it as 0 ms.
            std::istringstream parser(optarg);
            if (!(parser >> waitTime) || !parser.eof())
            {
                SWSS_LOG_ERROR("Invalid wait time: %s", optarg);
                printUsage();
                exit(EXIT_FAILURE);
            }
            SWSS_LOG_NOTICE("Wait time for response from orchagent set to %s milliseconds", optarg);
            break;
        }
        case 'h':
            printUsage();
            exit(EXIT_SUCCESS);

        case '?':
            SWSS_LOG_WARN("unknown option %c", optopt);
            printUsage();
            exit(EXIT_FAILURE);

        default:
            SWSS_LOG_ERROR("getopt_long failure");
            exit(EXIT_FAILURE);
        }
    }

    swss::DBConnector db(APPL_DB, swss::DBConnector::DEFAULT_UNIXSOCKET, 0);
    // Send warm restart query via "RESTARTCHECK" notification channel
    swss::NotificationProducer restartQuery(&db, "RESTARTCHECK");
    // Will listen for the reply on "RESTARTCHECKREPLY" channel.
    // The consumer is created before the query is sent.
    swss::NotificationConsumer restartQueryReply(&db, "RESTARTCHECKREPLY");

    std::vector<swss::FieldValueTuple> values;
    values.emplace_back("NoFreeze", noFreeze);
    values.emplace_back("SkipPendingTaskCheck", skipPendingTaskCheck);
    std::string op = "orchagent";
    SWSS_LOG_NOTICE("requested %s to do warm restart state check", op.c_str());
    restartQuery.send(op, op, values);

    swss::Select s;
    s.addSelectable(&restartQueryReply);
    swss::Selectable *sel;
    std::string op_ret, data;
    values.clear();
    int result = s.select(&sel, waitTime);
    if (result == swss::Select::OBJECT)
    {
        restartQueryReply.pop(op_ret, data, values);
        if (data == "READY")
        {
            SWSS_LOG_NOTICE("RESTARTCHECK success, %s is frozen and ready for warm restart", op_ret.c_str());
            std::cout << "RESTARTCHECK succeeded" << std::endl;
            return EXIT_SUCCESS;
        }
        else
        {
            SWSS_LOG_NOTICE("RESTARTCHECK failed, %s is not ready for warm restart with status %s",
                    op_ret.c_str(), data.c_str());
        }
    }
    else if (result == swss::Select::TIMEOUT)
    {
        // Nothing was popped, so op_ret is still empty; log the requested target.
        SWSS_LOG_NOTICE("RESTARTCHECK for %s timed out", op.c_str());
    }
    else
    {
        // Same here: no reply was read, report the target we asked.
        SWSS_LOG_NOTICE("RESTARTCHECK for %s error", op.c_str());
    }
    std::cout << "RESTARTCHECK failed" << std::endl;
    return EXIT_FAILURE;
}
60 changes: 60 additions & 0 deletions orchagent/orchdaemon.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include <unistd.h>
#include <unordered_map>
#include <limits.h>
#include "orchdaemon.h"
#include "logger.h"
#include <sairedis.h>
Expand Down Expand Up @@ -343,6 +344,26 @@ void OrchDaemon::start()
* is a good chance to flush the pipeline before next select happened.
*/
flush();

/*
* Asked to check warm restart readiness.
* Not doing this under Select::TIMEOUT condition because of
* the existence of finer granularity ExecutableTimer with select
*/
if (gSwitchOrch->checkRestartReady())
{
bool ret = warmRestartCheck();
if (ret)
{
// Orchagent is ready to perform warm restart, stop processing any new db data.
// Should sleep here or continue handling timers and etc.??
if (!gSwitchOrch->checkRestartNoFreeze())
{
SWSS_LOG_WARN("Orchagent is frozen for warm restart!");
sleep(UINT_MAX);
}
}
}
}
}

Expand Down Expand Up @@ -435,3 +456,42 @@ bool OrchDaemon::warmRestoreValidation()
WarmStart::setWarmStartState("orchagent", WarmStart::RESTORED);
return true;
}

/*
* Reply with "READY" notification if no pending tasks, and return true.
* Ortherwise reply with "NOT_READY" notification and return false.
* Further consideration is needed as to when orchagent is treated as warm restart ready.
* For now, no pending task should exist in any orch agent.
*/
bool OrchDaemon::warmRestartCheck()
{
std::vector<swss::FieldValueTuple> values;
std::string op = "orchagent";
std::string data = "READY";
bool ret = true;

vector<string> ts;
getTaskToSync(ts);

if (ts.size() != 0)
{
SWSS_LOG_NOTICE("WarmRestart check found pending tasks: ");
for(auto &s : ts)
{
SWSS_LOG_NOTICE(" %s", s.c_str());
}
if (!gSwitchOrch->skipPendingTaskCheck())
{
data = "NOT_READY";
ret = false;
}
else
{
SWSS_LOG_NOTICE("Orchagent objects dependency check skipped");
}
}

SWSS_LOG_NOTICE("Restart check result: %s", data.c_str());
gSwitchOrch->restartCheckReply(op, data, values);
return ret;
}
2 changes: 2 additions & 0 deletions orchagent/orchdaemon.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ class OrchDaemon
bool warmRestoreAndSyncUp();
void getTaskToSync(vector<string> &ts);
bool warmRestoreValidation();

bool warmRestartCheck();
private:
DBConnector *m_applDb;
DBConnector *m_configDb;
Expand Down
56 changes: 55 additions & 1 deletion orchagent/switchorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

#include "switchorch.h"
#include "converter.h"
#include "notifier.h"
#include "notificationproducer.h"

using namespace std;
using namespace swss;
Expand All @@ -27,8 +29,12 @@ const map<string, sai_packet_action_t> packet_action_map =
};

/*
 * Construct the switch orch on top of the given table and register a
 * notifier for the "RESTARTCHECK" channel, so that warm restart readiness
 * queries are delivered to doTask(NotificationConsumer&).
 *
 * db is kept in m_db; restartCheckReply() uses it to publish the reply.
 */
SwitchOrch::SwitchOrch(DBConnector *db, string tableName) :
        Orch(db, tableName),
        m_db(db)
{
    // NOTE(review): both objects are allocated with new and never deleted in
    // this class; presumably ownership passes to the Notifier / Orch executor
    // list - confirm against Orch::addExecutor().
    m_restartCheckNotificationConsumer = new NotificationConsumer(db, "RESTARTCHECK");
    auto restartCheckNotifier = new Notifier(m_restartCheckNotificationConsumer, this, "RESTARTCHECK");
    Orch::addExecutor(restartCheckNotifier);
}

void SwitchOrch::doTask(Consumer &consumer)
Expand Down Expand Up @@ -122,3 +128,51 @@ void SwitchOrch::doTask(Consumer &consumer)
}
}

/*
 * Handle a notification from the "RESTARTCHECK" channel.
 *
 * The notification is always popped from the consumer. If it came from the
 * restart check consumer, the stored request is reset first; a request whose
 * op is "orchagent" then marks the readiness check as pending and picks up
 * the "NoFreeze" / "SkipPendingTaskCheck" options, which are enabled only
 * when their value is exactly "true".
 */
void SwitchOrch::doTask(NotificationConsumer& consumer)
{
    SWSS_LOG_ENTER();

    std::string op;
    std::string data;
    std::vector<swss::FieldValueTuple> values;

    consumer.pop(op, data, values);

    if (&consumer != m_restartCheckNotificationConsumer)
    {
        return;
    }

    // Start from a clean request so options of an earlier query do not leak in.
    m_warmRestartCheck.skipPendingTaskCheck = false;
    m_warmRestartCheck.noFreeze = false;
    m_warmRestartCheck.checkRestartReadyState = false;

    SWSS_LOG_NOTICE("RESTARTCHECK notification for %s ", op.c_str());
    if (op != "orchagent")
    {
        return;
    }

    m_warmRestartCheck.checkRestartReadyState = true;

    // Collect "op|field:value|..." for a single summary log line.
    string summary = op;
    for (const auto &fv : values)
    {
        const auto &field = fvField(fv);
        const auto &value = fvValue(fv);

        summary += "|" + field + ":" + value;

        if (value != "true")
        {
            continue;
        }

        if (field == "NoFreeze")
        {
            m_warmRestartCheck.noFreeze = true;
        }
        else if (field == "SkipPendingTaskCheck")
        {
            m_warmRestartCheck.skipPendingTaskCheck = true;
        }
    }
    SWSS_LOG_NOTICE("%s", summary.c_str());
}

void SwitchOrch::restartCheckReply(const string &op, const string &data, std::vector<FieldValueTuple> &values)
{
NotificationProducer restartRequestReply(m_db, "RESTARTCHECKREPLY");
restartRequestReply.send(op, data, values);
checkRestartReadyDone();
}
20 changes: 20 additions & 0 deletions orchagent/switchorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,31 @@

#include "orch.h"

// State of the most recent warm restart readiness request received on the
// "RESTARTCHECK" channel. The field order matters: SwitchOrch initializes
// its instance positionally with {false, false, false}.
struct WarmRestartCheck
{
// True while a request with op "orchagent" is waiting to be answered;
// cleared by SwitchOrch::checkRestartReadyDone().
bool checkRestartReadyState;
// True when the request carried "NoFreeze" == "true".
bool noFreeze;
// True when the request carried "SkipPendingTaskCheck" == "true".
bool skipPendingTaskCheck;
};

// Orch for the switch table. It also receives warm restart readiness
// queries on the "RESTARTCHECK" notification channel and publishes the
// answer on "RESTARTCHECKREPLY".
class SwitchOrch : public Orch
{
public:
    SwitchOrch(DBConnector *db, string tableName);

    // True while a readiness query is waiting to be answered.
    bool checkRestartReady() const { return m_warmRestartCheck.checkRestartReadyState; }
    // True when the requester asked that orchagent not freeze on success.
    bool checkRestartNoFreeze() const { return m_warmRestartCheck.noFreeze; }
    // True when the requester asked to ignore pending tasks in the check.
    bool skipPendingTaskCheck() const { return m_warmRestartCheck.skipPendingTaskCheck; }
    // Mark the current readiness query as answered.
    void checkRestartReadyDone() { m_warmRestartCheck.checkRestartReadyState = false; }
    // Send the check result to the requester and mark the query as answered.
    void restartCheckReply(const string &op, const string &data, std::vector<FieldValueTuple> &values);
private:
    void doTask(Consumer &consumer);

    // Consumer of the "RESTARTCHECK" channel; assigned in the constructor.
    NotificationConsumer* m_restartCheckNotificationConsumer = nullptr;
    void doTask(NotificationConsumer& consumer);
    // Connection used to publish the restart check reply.
    DBConnector *m_db;

    // Information contained in the request from
    // external program for orchagent pre-shutdown state check
    WarmRestartCheck m_warmRestartCheck = {false, false, false};
};
Loading