-
Notifications
You must be signed in to change notification settings - Fork 694
Added changes to handle dependency check in FdbSyncd and FpmSyncd for warm-boot #1556
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
5530e69
cd04985
12d940a
d56fa46
b5c0e7c
486d26a
421f8e6
7fa56be
a3a9d04
a951e6b
a6b5521
8fbd131
787b1b8
08a3908
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -43,6 +43,66 @@ FdbSync::~FdbSync() | |
| } | ||
| } | ||
|
|
||
|
|
||
| // Check if interface entries are restored in kernel | ||
| bool FdbSync::isIntfRestoreDone() | ||
| { | ||
| vector<string> required_modules = { | ||
| "vxlanmgrd", | ||
| "intfmgrd", | ||
| "vlanmgrd", | ||
| "vrfmgrd" | ||
| }; | ||
|
|
||
| for(string& module : required_modules) | ||
|
||
| { | ||
| WarmStart::WarmStartState state; | ||
|
|
||
| WarmStart::getWarmStartState(module, state); | ||
| if(state == WarmStart::REPLAYED || state == WarmStart::RECONCILED) | ||
|
||
| { | ||
| SWSS_LOG_INFO("Module %s Replayed or Reconciled %d",module.c_str(), (int) state); | ||
| } | ||
| else | ||
| { | ||
| SWSS_LOG_INFO("Module %s NOT Replayed or Reconciled %d",module.c_str(), (int) state); | ||
| //return false; | ||
|
||
| // Return true till all the dependant code is checked in | ||
| return true; | ||
|
||
| } | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| // Check if vxlan entries are re-conciled to kernel | ||
| bool FdbSync::isReadyToReconcile() | ||
| { | ||
| vector<string> required_modules = { | ||
| "orchagent", | ||
| }; | ||
|
|
||
| for(string& module : required_modules) | ||
|
||
| { | ||
| WarmStart::WarmStartState state; | ||
|
|
||
| WarmStart::getWarmStartState(module, state); | ||
| if(state == WarmStart::RECONCILED) | ||
|
||
| { | ||
| SWSS_LOG_INFO("Module %s Reconciled %d",module.c_str(), (int) state); | ||
| } | ||
| else | ||
| { | ||
| SWSS_LOG_INFO("Module %s NOT Reconciled %d",module.c_str(), (int) state); | ||
| //return false; | ||
|
||
| // Return True untill the dependant orchagent code is commited | ||
| return true; | ||
| } | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| void FdbSync::processCfgEvpnNvo() | ||
| { | ||
| std::deque<KeyOpFieldsValuesTuple> entries; | ||
|
|
@@ -447,14 +507,17 @@ void FdbSync::macDelVxlanDB(string key) | |
| fvVector.push_back(t); | ||
| fvVector.push_back(v); | ||
|
|
||
| SWSS_LOG_NOTICE("%sVXLAN_FDB_TABLE: DEL_KEY %s vtep:%s type:%s", | ||
| m_AppRestartAssist->isWarmStartInProgress() ? "WARM-RESTART:" : "" , | ||
| key.c_str(), vtep.c_str(), type.c_str()); | ||
|
|
||
| // If warmstart is in progress, we take all netlink changes into the cache map | ||
| if (m_AppRestartAssist->isWarmStartInProgress()) | ||
| { | ||
| m_AppRestartAssist->insertToMap(APP_VXLAN_FDB_TABLE_NAME, key, fvVector, true); | ||
| return; | ||
| } | ||
|
|
||
| SWSS_LOG_INFO("VXLAN_FDB_TABLE: DEL_KEY %s vtep:%s type:%s", key.c_str(), vtep.c_str(), type.c_str()); | ||
| m_fdbTable.del(key); | ||
| return; | ||
|
|
||
|
|
@@ -476,14 +539,16 @@ void FdbSync::macAddVxlan(string key, struct in_addr vtep, string type, uint32_t | |
| fvVector.push_back(t); | ||
| fvVector.push_back(v); | ||
|
|
||
| SWSS_LOG_INFO("%sVXLAN_FDB_TABLE: ADD_KEY %s vtep:%s type:%s", | ||
| m_AppRestartAssist->isWarmStartInProgress() ? "WARM-RESTART:" : "" , | ||
| key.c_str(), svtep.c_str(), type.c_str()); | ||
| // If warmstart is in progress, we take all netlink changes into the cache map | ||
| if (m_AppRestartAssist->isWarmStartInProgress()) | ||
| { | ||
| m_AppRestartAssist->insertToMap(APP_VXLAN_FDB_TABLE_NAME, key, fvVector, false); | ||
| return; | ||
| } | ||
|
|
||
| SWSS_LOG_INFO("VXLAN_FDB_TABLE: ADD_KEY %s vtep:%s type:%s", key.c_str(), svtep.c_str(), type.c_str()); | ||
| m_fdbTable.set(key, fvVector); | ||
|
|
||
| return; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -10,7 +10,16 @@ | |
| #include "warmRestartAssist.h" | ||
|
|
||
| // The timeout value (in seconds) for fdbsyncd reconcilation logic | ||
| #define DEFAULT_FDBSYNC_WARMSTART_TIMER 30 | ||
| #define DEFAULT_FDBSYNC_WARMSTART_TIMER 600 | ||
|
||
|
|
||
| // Time to wait to run fdb reconcillation after fdbsyncd replay | ||
| #define FDBSYNC_RECON_TIMER 120 | ||
|
||
| /* | ||
| * This is the MAX time in seconds, fdbsyncd will wait after warm-reboot | ||
| * for the interface entries to be recreated in kernel before attempting to | ||
| * write the FDB data to kernel | ||
| */ | ||
| #define WARM_RESTORE_WAIT_TIME_OUT_MAX 180 | ||
|
||
|
|
||
| namespace swss { | ||
|
|
||
|
|
@@ -43,7 +52,8 @@ class FdbSync : public NetMsg | |
|
|
||
| virtual void onMsg(int nlmsg_type, struct nl_object *obj); | ||
|
|
||
| bool isFdbRestoreDone(); | ||
| bool isIntfRestoreDone(); | ||
| bool isReadyToReconcile(); | ||
|
|
||
| AppRestartAssist *getRestartAssist() | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,6 +35,8 @@ int main(int argc, char **argv) | |
| Selectable *temps; | ||
| int ret; | ||
| Select s; | ||
| SelectableTimer replayCheckTimer(timespec{0, 0}); | ||
| SelectableTimer reconCheckTimer(timespec{0, 0}); | ||
|
|
||
| using namespace std::chrono; | ||
|
|
||
|
|
@@ -44,8 +46,28 @@ int main(int argc, char **argv) | |
| */ | ||
| if (sync.getRestartAssist()->isWarmStartInProgress()) | ||
| { | ||
| int pasttime = 0; | ||
|
||
| sync.getRestartAssist()->readTablesToMap(); | ||
|
|
||
| while (!sync.isIntfRestoreDone()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
CPU is wasted on waiting. Could you subscribe Redis? #WontFix
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As this is not a continuous busy wait ( sleep is present), this should not cause the cpu to be continuously busy. Also there is nothing for fdbsyncd to do until the interface info is populated to kernel after system warm-reboot, hence it needs to wait till such time. |
||
| { | ||
| if (pasttime++ > WARM_RESTORE_WAIT_TIME_OUT_MAX) | ||
| { | ||
| SWSS_LOG_INFO("timed-out before all interface data was replayed to kernel!!!"); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What happens if intf is not restored after max_wait_time? Shouldn't we abort to avoid more issues?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. System will proceed further. Some mac programming to kernel might fail because underlying interface is not yet created.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we could not restore interface, why we should proceed further and get into some limbo state that may or may not have critical issues. I would suggest we abort to bring user's attention.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Interfaces will be eventually restored. The only impact will be that warm-reboot might not be hitless and there will be traffic loss seen. Not sure if we need to go for full abort and impact everything and all traffic. Requesting @prsunny to comment on this too
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "Some mac programming to kernel might fail because underlying interface is not yet created." so this condition will be recovered by someone later? Again, if it is a critical condition, we should raise/abort so we don't get into limbo state. |
||
| break; | ||
| } | ||
| sleep(1); | ||
| } | ||
| SWSS_LOG_NOTICE("Starting ReconcileTimer"); | ||
| sync.getRestartAssist()->startReconcileTimer(s); | ||
| replayCheckTimer.setInterval(timespec{1, 0}); | ||
| replayCheckTimer.start(); | ||
| s.addSelectable(&replayCheckTimer); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Remove extra blank line #Closed
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will do |
||
| else | ||
| { | ||
| sync.getRestartAssist()->warmStartDisabled(); | ||
| sync.m_reconcileDone = true; | ||
| } | ||
|
|
||
| netlink.registerGroup(RTNLGRP_LINK); | ||
|
|
@@ -67,14 +89,51 @@ int main(int argc, char **argv) | |
| { | ||
| s.select(&temps); | ||
|
|
||
| if(temps == (Selectable *)sync.getFdbStateTable()) | ||
| if (temps == (Selectable *)sync.getFdbStateTable()) | ||
| { | ||
| sync.processStateFdb(); | ||
| } | ||
| else if (temps == (Selectable *)sync.getCfgEvpnNvoTable()) | ||
| { | ||
| sync.processCfgEvpnNvo(); | ||
| } | ||
| else if (temps == &replayCheckTimer) | ||
| { | ||
| if (sync.getFdbStateTable()->empty() && sync.getCfgEvpnNvoTable()->empty()) | ||
| { | ||
| sync.getRestartAssist()->appDataReplayed(); | ||
| SWSS_LOG_NOTICE("FDB Replay Complete, Start Reconcile Check"); | ||
| reconCheckTimer.setInterval(timespec{FDBSYNC_RECON_TIMER, 0}); | ||
| reconCheckTimer.start(); | ||
| s.addSelectable(&reconCheckTimer); | ||
| } | ||
| else | ||
| { | ||
| replayCheckTimer.setInterval(timespec{1, 0}); | ||
| // re-start replay check timer | ||
| replayCheckTimer.start(); | ||
| } | ||
|
|
||
| } | ||
| else if (temps == &reconCheckTimer) | ||
| { | ||
| if (sync.isReadyToReconcile()) | ||
| { | ||
| if (sync.getRestartAssist()->isWarmStartInProgress()) | ||
| { | ||
| sync.m_reconcileDone = true; | ||
| sync.getRestartAssist()->stopReconcileTimer(s); | ||
| sync.getRestartAssist()->reconcile(); | ||
| SWSS_LOG_NOTICE("VXLAN FDB VNI Reconcillation Complete (Event)"); | ||
| } | ||
| } | ||
| else | ||
| { | ||
| reconCheckTimer.setInterval(timespec{1, 0}); | ||
| // Restart and check again in 1 sec | ||
| reconCheckTimer.start(); | ||
| } | ||
| } | ||
| else | ||
| { | ||
| /* | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,7 +16,9 @@ using namespace swss; | |
| * Default warm-restart timer interval for routing-stack app. To be used only if | ||
| * no explicit value has been defined in configuration. | ||
| */ | ||
| const uint32_t DEFAULT_ROUTING_RESTART_INTERVAL = 120; | ||
| const uint32_t DEFAULT_ROUTING_RESTART_INTERVAL = 600; | ||
|
||
|
|
||
| const uint32_t DEFAULT_ROUTING_RECON_CHECK_INTERVAL = 120; | ||
|
|
||
| // Wait 3 seconds after detecting EOIU reached state | ||
| // TODO: support eoiu hold interval config | ||
|
|
@@ -67,6 +69,10 @@ int main(int argc, char **argv) | |
| SelectableTimer eoiuCheckTimer(timespec{0, 0}); | ||
| // After eoiu flags are detected, start a hold timer before starting reconciliation. | ||
| SelectableTimer eoiuHoldTimer(timespec{0, 0}); | ||
|
|
||
| // Timers to wait for dependent reconcillation | ||
| SelectableTimer reconcileCheckTimer(timespec{0, 0}); | ||
| SelectableTimer reconcileHoldTimer(timespec{0, 0}); | ||
| /* | ||
| * Pipeline should be flushed right away to deal with state pending | ||
| * from previous try/catch iterations. | ||
|
|
@@ -85,13 +91,14 @@ int main(int argc, char **argv) | |
| { | ||
| /* Obtain warm-restart timer defined for routing application */ | ||
| time_t warmRestartIval = sync.m_warmStartHelper.getRestartTimer(); | ||
| warmStartTimer.setInterval(timespec{DEFAULT_ROUTING_RESTART_INTERVAL, 0}); | ||
| if (!warmRestartIval) | ||
| { | ||
| warmStartTimer.setInterval(timespec{DEFAULT_ROUTING_RESTART_INTERVAL, 0}); | ||
| reconcileCheckTimer.setInterval(timespec{DEFAULT_ROUTING_RECON_CHECK_INTERVAL, 0}); | ||
| } | ||
| else | ||
| { | ||
| warmStartTimer.setInterval(timespec{warmRestartIval, 0}); | ||
| reconcileCheckTimer.setInterval(timespec{warmRestartIval, 0}); | ||
| } | ||
|
|
||
| /* Execute restoration instruction and kick off warm-restart timer */ | ||
|
|
@@ -100,6 +107,9 @@ int main(int argc, char **argv) | |
| warmStartTimer.start(); | ||
| s.addSelectable(&warmStartTimer); | ||
| SWSS_LOG_NOTICE("Warm-Restart timer started."); | ||
| reconcileCheckTimer.start(); | ||
| s.addSelectable(&reconcileCheckTimer); | ||
| SWSS_LOG_NOTICE("reconcileCheckTimer timer started "); | ||
| } | ||
|
|
||
| // Also start periodic eoiu check timer, first wait 5 seconds, then check every 1 second | ||
|
|
@@ -108,6 +118,10 @@ int main(int argc, char **argv) | |
| s.addSelectable(&eoiuCheckTimer); | ||
| SWSS_LOG_NOTICE("Warm-Restart eoiuCheckTimer timer started."); | ||
| } | ||
| else | ||
| { | ||
| sync.m_warmStartHelper.setState(WarmStart::WSDISABLED); | ||
| } | ||
|
|
||
| while (true) | ||
| { | ||
|
|
@@ -122,17 +136,26 @@ int main(int argc, char **argv) | |
| * select() loop. | ||
| * Note: route reconciliation always succeeds, it will not be done twice. | ||
| */ | ||
| if (temps == &warmStartTimer || temps == &eoiuHoldTimer) | ||
| if (temps == &warmStartTimer || temps == &eoiuHoldTimer || temps == &reconcileHoldTimer) | ||
| { | ||
| bool readyToReconcile = false; | ||
| if (temps == &warmStartTimer) | ||
| { | ||
| readyToReconcile = true; | ||
|
||
| SWSS_LOG_NOTICE("Warm-Restart timer expired."); | ||
| } | ||
| else if (temps == &reconcileHoldTimer) | ||
| { | ||
| readyToReconcile = true; | ||
| SWSS_LOG_NOTICE("Warm-Restart reconcile hold timer expired."); | ||
| } | ||
| else | ||
| { | ||
| readyToReconcile = sync.isReadyToReconcile(); | ||
|
||
| SWSS_LOG_NOTICE("Warm-Restart EOIU hold timer expired."); | ||
| } | ||
| if (sync.m_warmStartHelper.inProgress()) | ||
|
|
||
| if (sync.m_warmStartHelper.inProgress() && readyToReconcile) | ||
| { | ||
| sync.m_warmStartHelper.reconcile(); | ||
| SWSS_LOG_NOTICE("Warm-Restart reconciliation processed."); | ||
|
|
@@ -174,6 +197,30 @@ int main(int argc, char **argv) | |
| { | ||
| s.removeSelectable(&eoiuCheckTimer); | ||
| } | ||
| } | ||
| else if (temps == &reconcileCheckTimer) | ||
| { | ||
| SWSS_LOG_NOTICE("reconcile Check timer Expired "); | ||
| if (sync.m_warmStartHelper.inProgress()) | ||
| { | ||
| if (sync.isReadyToReconcile()) | ||
| { | ||
| reconcileHoldTimer.setInterval(timespec{2, 0}); | ||
|
||
| reconcileHoldTimer.start(); | ||
| s.addSelectable(&reconcileHoldTimer); | ||
| SWSS_LOG_NOTICE("Warm-Restart started reconcile hold timer "); | ||
| s.removeSelectable(&reconcileCheckTimer); | ||
| continue; | ||
| } | ||
| // re-start check timer for every 2 seconds now on | ||
| reconcileCheckTimer.setInterval(timespec{2, 0}); | ||
| reconcileCheckTimer.start(); | ||
| SWSS_LOG_NOTICE("Warm-Restart reconcileCheckTimer restarted"); | ||
| } | ||
| else | ||
| { | ||
| s.removeSelectable(&reconcileCheckTimer); | ||
| } | ||
| } | ||
| else if (!warmStartEnabled || sync.m_warmStartHelper.isReconciled()) | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -920,3 +920,31 @@ string RouteSync::getNextHopIf(struct rtnl_route *route_obj) | |
|
|
||
| return result; | ||
| } | ||
|
|
||
| // Check if ependent entries are re-conciled | ||
| bool RouteSync::isReadyToReconcile() | ||
| { | ||
| vector<string> required_modules = { | ||
| "orchagent", | ||
| }; | ||
|
|
||
| for(string& module : required_modules) | ||
nkelapur marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| { | ||
| WarmStart::WarmStartState state; | ||
|
|
||
| WarmStart::getWarmStartState(module, state); | ||
| if(state == WarmStart::RECONCILED || state == WarmStart::WSDISABLED) | ||
nkelapur marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| { | ||
| SWSS_LOG_INFO("Module %s Reconciled %d",module.c_str(), (int) state); | ||
| } | ||
| else | ||
| { | ||
| SWSS_LOG_INFO("Module %s NOT Reconciled %d",module.c_str(), (int) state); | ||
| //return false; | ||
| //Return true untill dependent module code is commited | ||
|
||
| return true; | ||
| } | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
FdbSync::isIntfRestoreDone() , FdbSync::isReadyToReconcile() and RouteSync::isReadyToReconcile() are doing similar tasks, it seems we could make them into library calls with different input parameters. Later PR is probably fine.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sure .. will consider in future updates to the code