Skip to content

Commit 10c5bd1

Browse files
committed
Add orchagent pre-warm-restart check mechanism
Signed-off-by: Jipan Yang <[email protected]>
1 parent c9ed2c4 commit 10c5bd1

File tree

10 files changed

+266
-21
lines changed

10 files changed

+266
-21
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ neighsyncd/neighsyncd
5252
portsyncd/portsyncd
5353
orchagent/orchagent
5454
orchagent/routeresync
55+
orchagent/orchagent_restart_check
5556
swssconfig/swssconfig
5657
swssconfig/swssplayer
5758
tests/tests

orchagent/Makefile.am

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ dist_swss_DATA = \
1010
pfc_detect_barefoot.lua \
1111
pfc_restore.lua
1212

13-
bin_PROGRAMS = orchagent routeresync
13+
bin_PROGRAMS = orchagent routeresync orchagent_restart_check
1414

1515
if DEBUG
1616
DBGFLAGS = -ggdb -DDEBUG
@@ -81,3 +81,7 @@ routeresync_SOURCES = routeresync.cpp
8181
routeresync_CFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON)
8282
routeresync_CPPFLAGS = $(DBGFLAGS) $(AM_CFLAGS) $(CFLAGS_COMMON)
8383
routeresync_LDADD = -lswsscommon
84+
85+
orchagent_restart_check_SOURCES = orchagent_restart_check.cpp
86+
orchagent_restart_check_CPPFLAGS = $(DBGFLAGS) $(AM_CPPFLAGS) $(CFLAGS_COMMON)
87+
orchagent_restart_check_LDADD = -lhiredis -lswsscommon -lpthread

orchagent/orch.cpp

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,30 @@ void Consumer::drain()
185185
m_orch->doTask(*this);
186186
}
187187

188+
/*
 * Render one pending task as a single line of text:
 *   <table name><table separator><key>|<op>[|<field>:<value>]...
 * The table name and separator are taken from this consumer's table.
 */
string Consumer::dumpTuple(KeyOpFieldsValuesTuple &tuple)
{
    // Header part: table, separator, key and operation.
    string dump = getTableName();
    dump += getConsumerTable()->getTableNameSeparator();
    dump += kfvKey(tuple);
    dump += "|";
    dump += kfvOp(tuple);

    // One "|field:value" section per attribute, in stored order.
    for (const auto &fv : kfvFieldsValues(tuple))
    {
        dump += "|";
        dump += fvField(fv);
        dump += ":";
        dump += fvValue(fv);
    }

    return dump;
}
199+
200+
/*
 * Append the textual form (see dumpTuple) of every entry currently held
 * in m_toSync to ts. Existing contents of ts are left untouched.
 */
void Consumer::dumpToSyncTasks(vector<string> &ts)
{
    for (auto &pending : m_toSync)
    {
        // pending.second is the key/op/fields-values tuple of the task.
        ts.push_back(dumpTuple(pending.second));
    }
}
211+
188212
size_t Orch::addExistingData(const string& tableName)
189213
{
190214
auto consumer = dynamic_cast<Consumer *>(getExecutor(tableName));
@@ -224,7 +248,7 @@ bool Orch::bake()
224248
{
225249
continue;
226250
}
227-
251+
228252
size_t refilled = consumer->refillToSync();
229253
SWSS_LOG_NOTICE("Add warm input: %s, %zd", executorName.c_str(), refilled);
230254
}
@@ -326,6 +350,21 @@ void Orch::doTask()
326350
}
327351
}
328352

353+
/*
 * Collect the pending tasks of every Consumer registered in m_consumerMap
 * and append their textual dumps to ts. Executors that are not Consumers
 * are skipped with a debug log.
 */
void Orch::dumpToSyncTasks(vector<string> &ts)
{
    for (auto &entry : m_consumerMap)
    {
        auto *asConsumer = dynamic_cast<Consumer *>(entry.second.get());
        if (asConsumer != nullptr)
        {
            asConsumer->dumpToSyncTasks(ts);
        }
        else
        {
            SWSS_LOG_DEBUG("Executor is not a Consumer");
        }
    }
}
367+
329368
void Orch::logfileReopen()
330369
{
331370
gRecordOfs.close();
@@ -347,12 +386,7 @@ void Orch::logfileReopen()
347386

348387
void Orch::recordTuple(Consumer &consumer, KeyOpFieldsValuesTuple &tuple)
349388
{
350-
string s = consumer.getTableName() + ":" + kfvKey(tuple)
351-
+ "|" + kfvOp(tuple);
352-
for (auto i = kfvFieldsValues(tuple).begin(); i != kfvFieldsValues(tuple).end(); i++)
353-
{
354-
s += "|" + fvField(*i) + ":" + fvValue(*i);
355-
}
389+
string s = consumer.dumpTuple(tuple);
356390

357391
gRecordOfs << getTimestamp() << "|" << s << endl;
358392

@@ -366,13 +400,7 @@ void Orch::recordTuple(Consumer &consumer, KeyOpFieldsValuesTuple &tuple)
366400

367401
string Orch::dumpTuple(Consumer &consumer, KeyOpFieldsValuesTuple &tuple)
368402
{
369-
string s = consumer.getTableName() + ":" + kfvKey(tuple)
370-
+ "|" + kfvOp(tuple);
371-
for (auto i = kfvFieldsValues(tuple).begin(); i != kfvFieldsValues(tuple).end(); i++)
372-
{
373-
s += "|" + fvField(*i) + ":" + fvValue(*i);
374-
}
375-
403+
string s = consumer.dumpTuple(tuple);
376404
return s;
377405
}
378406

orchagent/orch.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,9 @@ class Consumer : public Executor {
116116
return getConsumerTable()->getTableName();
117117
}
118118

119+
string dumpTuple(KeyOpFieldsValuesTuple &tuple);
120+
void dumpToSyncTasks(vector<string> &ts);
121+
119122
size_t refillToSync();
120123
size_t refillToSync(Table* table);
121124
void execute();
@@ -124,7 +127,7 @@ class Consumer : public Executor {
124127
/* Store the latest 'golden' status */
125128
// TODO: hide?
126129
SyncMap m_toSync;
127-
130+
128131
protected:
129132
// Returns: the number of entries added to m_toSync
130133
size_t addToSync(std::deque<KeyOpFieldsValuesTuple> &entries);
@@ -162,7 +165,7 @@ class Orch
162165
// Prepare for warm start if Redis contains valid input data
163166
// otherwise fallback to cold start
164167
virtual bool bake();
165-
168+
166169
/* Iterate all consumers in m_consumerMap and run doTask(Consumer) */
167170
void doTask();
168171

@@ -173,6 +176,8 @@ class Orch
173176

174177
/* TODO: refactor recording */
175178
static void recordTuple(Consumer &consumer, KeyOpFieldsValuesTuple &tuple);
179+
180+
void dumpToSyncTasks(vector<string> &ts);
176181
protected:
177182
ConsumerMap m_consumerMap;
178183

orchagent/orchagent_restart_check.cpp

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#include <iostream>
2+
#include <sstream>
3+
4+
#include <unistd.h>
5+
#include <getopt.h>
6+
7+
#include "notificationproducer.h"
8+
#include "notificationconsumer.h"
9+
#include "select.h"
10+
#include "logger.h"
11+
12+
int main(int argc, char **argv)
13+
{
14+
swss::Logger::getInstance().setMinPrio(swss::Logger::SWSS_NOTICE);
15+
SWSS_LOG_ENTER();
16+
17+
std::string op = "orchagent";
18+
19+
swss::DBConnector db(APPL_DB, swss::DBConnector::DEFAULT_UNIXSOCKET, 0);
20+
swss::NotificationProducer restartQuery(&db, "RESTARTCHECK");
21+
22+
swss::NotificationConsumer restartQueryReply(&db, "RESTARTCHECKREPLY");
23+
swss::Select s;
24+
s.addSelectable(&restartQueryReply);
25+
swss::Selectable *sel;
26+
27+
std::vector<swss::FieldValueTuple> values;
28+
SWSS_LOG_NOTICE("requested %s to do warm restart state check and freeze if ready", op.c_str());
29+
restartQuery.send(op, op, values);
30+
31+
int result = s.select(&sel, 3000);
32+
if (result == swss::Select::OBJECT)
33+
{
34+
std::string op_ret, data;
35+
36+
restartQueryReply.pop(op_ret, data, values);
37+
if (op_ret == "READY")
38+
{
39+
SWSS_LOG_NOTICE("RESTARTCHECK success, %s is frozen and ready for warm restart", op.c_str());
40+
std::cout << "RESTARTCHECK succeeded" << std::endl;
41+
return EXIT_SUCCESS;
42+
}
43+
else
44+
{
45+
SWSS_LOG_NOTICE("RESTARTCHECK failed, %s is not ready for warm restart", op.c_str());
46+
}
47+
}
48+
else if (result == swss::Select::TIMEOUT)
49+
{
50+
SWSS_LOG_NOTICE("RESTARTCHECK for %s timed out", op.c_str());
51+
}
52+
else
53+
{
54+
SWSS_LOG_NOTICE("RESTARTCHECK for %s error", op.c_str());
55+
}
56+
std::cout << "RESTARTCHECK failed" << std::endl;
57+
return EXIT_FAILURE;
58+
}

orchagent/orchdaemon.cpp

Lines changed: 69 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include "orchdaemon.h"
44
#include "logger.h"
55
#include <sairedis.h>
6+
#include <limits.h>
7+
#include "notificationproducer.h"
68

79
#define SAI_SWITCH_ATTR_CUSTOM_RANGE_BASE SAI_SWITCH_ATTR_CUSTOM_RANGE_START
810
#include "sairedis.h"
@@ -27,6 +29,7 @@ RouteOrch *gRouteOrch;
2729
AclOrch *gAclOrch;
2830
CrmOrch *gCrmOrch;
2931
BufferOrch *gBufferOrch;
32+
SwitchOrch *gSwitchOrch;
3033

3134
OrchDaemon::OrchDaemon(DBConnector *applDb, DBConnector *configDb, DBConnector *stateDb) :
3235
m_applDb(applDb),
@@ -49,7 +52,7 @@ bool OrchDaemon::init()
4952

5053
string platform = getenv("platform") ? getenv("platform") : "";
5154

52-
SwitchOrch *switch_orch = new SwitchOrch(m_applDb, APP_SWITCH_TABLE_NAME);
55+
gSwitchOrch = new SwitchOrch(m_applDb, APP_SWITCH_TABLE_NAME);
5356

5457
const int portsorch_base_pri = 40;
5558

@@ -116,7 +119,7 @@ bool OrchDaemon::init()
116119
CFG_DTEL_EVENT_TABLE_NAME
117120
};
118121

119-
m_orchList = { switch_orch, gCrmOrch, gBufferOrch, gPortsOrch, intfs_orch, gNeighOrch, gRouteOrch, copp_orch, tunnel_decap_orch, qos_orch, mirror_orch };
122+
m_orchList = { gSwitchOrch, gCrmOrch, gBufferOrch, gPortsOrch, intfs_orch, gNeighOrch, gRouteOrch, copp_orch, tunnel_decap_orch, qos_orch, mirror_orch };
120123

121124
bool initialize_dtel = false;
122125
if (platform == BFN_PLATFORM_SUBSTRING || platform == VS_PLATFORM_SUBSTRING)
@@ -320,5 +323,69 @@ void OrchDaemon::start()
320323
* is a good chance to flush the pipeline before next select happened.
321324
*/
322325
flush();
326+
327+
/*
328+
* Asked to check warm restart readiness.
329+
* Not doing this under Select::TIMEOUT condition because of
330+
* the existence of finer granularity ExecutableTimer with select
331+
*/
332+
if (gSwitchOrch->checkRestartReady())
333+
{
334+
bool ret = warmRestartCheckReply();
335+
if (ret)
336+
{
337+
// Orchagent is ready to perform warm restart, stop processing any new db data.
338+
// Should sleep here or continue handling timers and etc.??
339+
SWSS_LOG_WARN("Orchagent is frozen for warm restart!");
340+
sleep(UINT_MAX);
341+
}
342+
}
343+
344+
}
345+
}
346+
347+
/*
348+
* Get tasks to sync for consumers of each orch being managed by this orch daemon
349+
*/
350+
void OrchDaemon::getTaskToSync(vector<string> &ts)
351+
{
352+
for (Orch *o : m_orchList)
353+
{
354+
o->dumpToSyncTasks(ts);
323355
}
324356
}
357+
358+
/*
359+
* Reply with "READY" notification if no pending tasks, and return true.
360+
* Ortherwise reply with "NOT_READY" notification and return false.
361+
* Further consideration is needed as to when orchagent is treated as warm restart ready.
362+
* For now, no pending task should exist in any orch agent.
363+
*/
364+
bool OrchDaemon::warmRestartCheckReply()
365+
{
366+
NotificationProducer restartRequestReply(m_applDb, "RESTARTCHECKREPLY");
367+
std::vector<swss::FieldValueTuple> values;
368+
std::string op = "READY";
369+
bool ret = true;
370+
371+
vector<string> ts;
372+
getTaskToSync(ts);
373+
374+
if (ts.size() != 0)
375+
{
376+
SWSS_LOG_ERROR("WarmRestart not ready with pending tasks: ");
377+
for(auto &s : ts)
378+
{
379+
SWSS_LOG_NOTICE("%s", s.c_str());
380+
}
381+
op = "NOT_READY";
382+
ret = false;
383+
}
384+
385+
SWSS_LOG_NOTICE("Restart check result: %s", op.c_str());
386+
387+
restartRequestReply.send(op, op, values);
388+
gSwitchOrch->checkRestartReadyDone();
389+
return ret;
390+
}
391+

orchagent/orchdaemon.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ class OrchDaemon
3434

3535
bool init();
3636
void start();
37+
void getTaskToSync(vector<string> &ts);
38+
bool warmRestartCheckReply();
3739
private:
3840
DBConnector *m_applDb;
3941
DBConnector *m_configDb;

orchagent/switchorch.cpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "switchorch.h"
44
#include "converter.h"
5+
#include "notifier.h"
56

67
using namespace std;
78
using namespace swss;
@@ -27,8 +28,13 @@ const map<string, sai_packet_action_t> packet_action_map =
2728
};
2829

2930
SwitchOrch::SwitchOrch(DBConnector *db, string tableName) :
30-
Orch(db, tableName)
31+
Orch(db, tableName),
32+
m_db(db)
3133
{
34+
m_restartCheckNotificationConsumer = new NotificationConsumer(db, "RESTARTCHECK");
35+
auto restartCheckNotifier = new Notifier(m_restartCheckNotificationConsumer, this);
36+
// restartCheckNotifier->setName("RESTARTCHECK");
37+
Orch::addExecutor("RESTARTCHECK", restartCheckNotifier);
3238
}
3339

3440
void SwitchOrch::doTask(Consumer &consumer)
@@ -122,3 +128,25 @@ void SwitchOrch::doTask(Consumer &consumer)
122128
}
123129
}
124130

131+
/*
 * Handle a notification delivered to this orch.
 *
 * The notification is always popped from the consumer. Only when it came
 * from the RESTARTCHECK consumer and its op is "orchagent" is the
 * checkRestartReadyState flag set. That flag is what checkRestartReady()
 * reports.
 */
void SwitchOrch::doTask(NotificationConsumer& consumer)
{
    SWSS_LOG_ENTER();

    std::string request;
    std::string payload;
    std::vector<swss::FieldValueTuple> values;

    consumer.pop(request, payload, values);

    if (&consumer == m_restartCheckNotificationConsumer)
    {
        SWSS_LOG_NOTICE("RESTARTCHECK notification for %s ", request.c_str());
        if (request == "orchagent")
        {
            checkRestartReadyState = true;
        }
    }
}
152+

orchagent/switchorch.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,14 @@ class SwitchOrch : public Orch
66
{
77
public:
88
SwitchOrch(DBConnector *db, string tableName);
9-
9+
bool checkRestartReady() { return checkRestartReadyState; }
10+
void checkRestartReadyDone() { checkRestartReadyState = false; }
1011
private:
1112
void doTask(Consumer &consumer);
13+
14+
NotificationConsumer* m_restartCheckNotificationConsumer;
15+
void doTask(NotificationConsumer& consumer);
16+
// Whether to check readiness of warm restart
17+
bool checkRestartReadyState = false;
18+
DBConnector *m_db;
1219
};

0 commit comments

Comments
 (0)