Skip to content

Commit d3c3a7d

Browse files
[mux]: Implement rollback for failed mux switchovers (#2714) (#2761)
- Make all SAI API operations needed for switchover idempotent - Implement rollback when a switchover fails Signed-off-by: Lawrence Lee <[email protected]>
1 parent f977724 commit d3c3a7d

File tree

10 files changed

+766
-51
lines changed

10 files changed

+766
-51
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ swssconfig/swssplayer
7474
tlm_teamd/tlm_teamd
7575
teamsyncd/teamsyncd
7676
tests/tests
77+
tests/mock_tests/tests_response_publisher
78+
tests/mock_tests/tests_fpmsyncd
79+
tests/mock_tests/tests_intfmgrd
80+
tests/mock_tests/tests_portsyncd
7781

7882

7983
# Test Files #
@@ -85,5 +89,7 @@ tests/mock_tests/tests.trs
8589
tests/test-suite.log
8690
tests/tests.log
8791
tests/tests.trs
92+
tests/mock_tests/**/*log
93+
tests/mock_tests/**/*trs
8894
orchagent/p4orch/tests/**/*gcda
8995
orchagent/p4orch/tests/**/*gcno

orchagent/aclorch.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,6 +1075,11 @@ bool AclRule::createRule()
10751075
status = sai_acl_api->create_acl_entry(&m_ruleOid, gSwitchId, (uint32_t)rule_attrs.size(), rule_attrs.data());
10761076
if (status != SAI_STATUS_SUCCESS)
10771077
{
1078+
if (status == SAI_STATUS_ITEM_ALREADY_EXISTS)
1079+
{
1080+
SWSS_LOG_NOTICE("ACL rule %s already exists", m_id.c_str());
1081+
return true;
1082+
}
10781083
SWSS_LOG_ERROR("Failed to create ACL rule %s, rv:%d",
10791084
m_id.c_str(), status);
10801085
AclRange::remove(range_objects, range_object_list.count);
@@ -1136,6 +1141,12 @@ bool AclRule::removeRule()
11361141
auto status = sai_acl_api->remove_acl_entry(m_ruleOid);
11371142
if (status != SAI_STATUS_SUCCESS)
11381143
{
1144+
if (status == SAI_STATUS_ITEM_NOT_FOUND)
1145+
{
1146+
SWSS_LOG_NOTICE("ACL rule already deleted");
1147+
m_ruleOid = SAI_NULL_OBJECT_ID;
1148+
return true;
1149+
}
11391150
SWSS_LOG_ERROR("Failed to delete ACL rule, status %s", sai_serialize_status(status).c_str());
11401151
return false;
11411152
}

orchagent/muxorch.cpp

Lines changed: 83 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ static sai_status_t create_route(IpPrefix &pfx, sai_object_id_t nh)
123123
sai_status_t status = sai_route_api->create_route_entry(&route_entry, (uint32_t)attrs.size(), attrs.data());
124124
if (status != SAI_STATUS_SUCCESS)
125125
{
126+
if (status == SAI_STATUS_ITEM_ALREADY_EXISTS) {
127+
SWSS_LOG_NOTICE("Tunnel route to %s already exists", pfx.to_string().c_str());
128+
return SAI_STATUS_SUCCESS;
129+
}
126130
SWSS_LOG_ERROR("Failed to create tunnel route %s,nh %" PRIx64 " rv:%d",
127131
pfx.getIp().to_string().c_str(), nh, status);
128132
return status;
@@ -152,6 +156,10 @@ static sai_status_t remove_route(IpPrefix &pfx)
152156
sai_status_t status = sai_route_api->remove_route_entry(&route_entry);
153157
if (status != SAI_STATUS_SUCCESS)
154158
{
159+
if (status == SAI_STATUS_ITEM_NOT_FOUND) {
160+
SWSS_LOG_NOTICE("Tunnel route to %s already removed", pfx.to_string().c_str());
161+
return SAI_STATUS_SUCCESS;
162+
}
155163
SWSS_LOG_ERROR("Failed to remove tunnel route %s, rv:%d",
156164
pfx.getIp().to_string().c_str(), status);
157165
return status;
@@ -504,15 +512,15 @@ void MuxCable::setState(string new_state)
504512

505513
mux_cb_orch_->updateMuxMetricState(mux_name_, new_state, true);
506514

507-
MuxState state = state_;
515+
prev_state_ = state_;
508516
state_ = ns;
509517

510518
st_chg_in_progress_ = true;
511519

512520
if (!(this->*(state_machine_handlers_[it->second]))())
513521
{
514522
//Reset back to original state
515-
state_ = state;
523+
state_ = prev_state_;
516524
st_chg_in_progress_ = false;
517525
st_chg_failed_ = true;
518526
throw std::runtime_error("Failed to handle state transition");
@@ -528,6 +536,51 @@ void MuxCable::setState(string new_state)
528536
return;
529537
}
530538

539+
void MuxCable::rollbackStateChange()
540+
{
541+
if (prev_state_ == MuxState::MUX_STATE_FAILED || prev_state_ == MuxState::MUX_STATE_PENDING)
542+
{
543+
SWSS_LOG_ERROR("[%s] Rollback to %s not supported", mux_name_.c_str(),
544+
muxStateValToString.at(prev_state_).c_str());
545+
return;
546+
}
547+
SWSS_LOG_WARN("[%s] Rolling back state change to %s", mux_name_.c_str(),
548+
muxStateValToString.at(prev_state_).c_str());
549+
mux_cb_orch_->updateMuxMetricState(mux_name_, muxStateValToString.at(prev_state_), true);
550+
st_chg_in_progress_ = true;
551+
state_ = prev_state_;
552+
bool success = false;
553+
switch (prev_state_)
554+
{
555+
case MuxState::MUX_STATE_ACTIVE:
556+
success = stateActive();
557+
break;
558+
case MuxState::MUX_STATE_INIT:
559+
case MuxState::MUX_STATE_STANDBY:
560+
success = stateStandby();
561+
break;
562+
case MuxState::MUX_STATE_FAILED:
563+
case MuxState::MUX_STATE_PENDING:
564+
// Check at the start of the function means we will never reach here
565+
SWSS_LOG_ERROR("[%s] Rollback to %s not supported", mux_name_.c_str(),
566+
muxStateValToString.at(prev_state_).c_str());
567+
return;
568+
}
569+
st_chg_in_progress_ = false;
570+
if (success)
571+
{
572+
st_chg_failed_ = false;
573+
}
574+
else
575+
{
576+
st_chg_failed_ = true;
577+
SWSS_LOG_ERROR("[%s] Rollback to %s failed",
578+
mux_name_.c_str(), muxStateValToString.at(prev_state_).c_str());
579+
}
580+
mux_cb_orch_->updateMuxMetricState(mux_name_, muxStateValToString.at(state_), false);
581+
mux_cb_orch_->updateMuxState(mux_name_, muxStateValToString.at(state_));
582+
}
583+
531584
string MuxCable::getState()
532585
{
533586
SWSS_LOG_INFO("Get state request for %s, state %s",
@@ -845,8 +898,6 @@ void MuxNbrHandler::updateTunnelRoute(NextHopKey nh, bool add)
845898
}
846899
}
847900

848-
std::map<std::string, AclTable> MuxAclHandler::acl_table_;
849-
850901
MuxAclHandler::MuxAclHandler(sai_object_id_t port, string alias)
851902
{
852903
SWSS_LOG_ENTER();
@@ -858,32 +909,21 @@ MuxAclHandler::MuxAclHandler(sai_object_id_t port, string alias)
858909
port_ = port;
859910
alias_ = alias;
860911

861-
auto found = acl_table_.find(table_name);
862-
if (found == acl_table_.end())
863-
{
864-
SWSS_LOG_NOTICE("First time create for port %" PRIx64 "", port);
912+
// Always try to create the table first. If it already exists, function will return early.
913+
createMuxAclTable(port, table_name);
865914

866-
// First time handling of Mux Table, create ACL table, and bind
867-
createMuxAclTable(port, table_name);
915+
SWSS_LOG_NOTICE("Binding port %" PRIx64 "", port);
916+
917+
AclRule* rule = gAclOrch->getAclRule(table_name, rule_name);
918+
if (rule == nullptr)
919+
{
868920
shared_ptr<AclRulePacket> newRule =
869921
make_shared<AclRulePacket>(gAclOrch, rule_name, table_name, false /*no counters*/);
870922
createMuxAclRule(newRule, table_name);
871923
}
872924
else
873925
{
874-
SWSS_LOG_NOTICE("Binding port %" PRIx64 "", port);
875-
876-
AclRule* rule = gAclOrch->getAclRule(table_name, rule_name);
877-
if (rule == nullptr)
878-
{
879-
shared_ptr<AclRulePacket> newRule =
880-
make_shared<AclRulePacket>(gAclOrch, rule_name, table_name, false /*no counters*/);
881-
createMuxAclRule(newRule, table_name);
882-
}
883-
else
884-
{
885-
gAclOrch->updateAclRule(table_name, rule_name, MATCH_IN_PORTS, &port, RULE_OPER_ADD);
886-
}
926+
gAclOrch->updateAclRule(table_name, rule_name, MATCH_IN_PORTS, &port, RULE_OPER_ADD);
887927
}
888928
}
889929

@@ -916,23 +956,16 @@ void MuxAclHandler::createMuxAclTable(sai_object_id_t port, string strTable)
916956
{
917957
SWSS_LOG_ENTER();
918958

919-
auto inserted = acl_table_.emplace(piecewise_construct,
920-
std::forward_as_tuple(strTable),
921-
std::forward_as_tuple(gAclOrch, strTable));
922-
923-
assert(inserted.second);
924-
925-
AclTable& acl_table = inserted.first->second;
926-
927959
sai_object_id_t table_oid = gAclOrch->getTableById(strTable);
928960
if (table_oid != SAI_NULL_OBJECT_ID)
929961
{
930962
// DROP ACL table is already created
931-
SWSS_LOG_NOTICE("ACL table %s exists, reuse the same", strTable.c_str());
932-
acl_table = *(gAclOrch->getTableByOid(table_oid));
963+
SWSS_LOG_INFO("ACL table %s exists, reuse the same", strTable.c_str());
933964
return;
934965
}
935966

967+
SWSS_LOG_NOTICE("First time create for port %" PRIx64 "", port);
968+
AclTable acl_table(gAclOrch, strTable);
936969
auto dropType = gAclOrch->getAclTableType(TABLE_TYPE_DROP);
937970
assert(dropType);
938971
acl_table.validateAddType(*dropType);
@@ -1770,10 +1803,25 @@ bool MuxCableOrch::addOperation(const Request& request)
17701803
{
17711804
mux_obj->setState(state);
17721805
}
1773-
catch(const std::runtime_error& error)
1806+
catch(const std::runtime_error& e)
17741807
{
17751808
SWSS_LOG_ERROR("Mux Error setting state %s for port %s. Error: %s",
1776-
state.c_str(), port_name.c_str(), error.what());
1809+
state.c_str(), port_name.c_str(), e.what());
1810+
mux_obj->rollbackStateChange();
1811+
return true;
1812+
}
1813+
catch (const std::logic_error& e)
1814+
{
1815+
SWSS_LOG_ERROR("Logic error while setting state %s for port %s. Error: %s",
1816+
state.c_str(), port_name.c_str(), e.what());
1817+
mux_obj->rollbackStateChange();
1818+
return true;
1819+
}
1820+
catch (const std::exception& e)
1821+
{
1822+
SWSS_LOG_ERROR("Exception caught while setting state %s for port %s. Error: %s",
1823+
state.c_str(), port_name.c_str(), e.what());
1824+
mux_obj->rollbackStateChange();
17771825
return true;
17781826
}
17791827

orchagent/muxorch.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,6 @@ class MuxAclHandler
5252
void createMuxAclRule(shared_ptr<AclRulePacket> rule, string strTable);
5353
void bindAllPorts(AclTable &acl_table);
5454

55-
// class shared dict: ACL table name -> ACL table
56-
static std::map<std::string, AclTable> acl_table_;
5755
sai_object_id_t port_ = SAI_NULL_OBJECT_ID;
5856
string alias_;
5957
};
@@ -98,6 +96,7 @@ class MuxCable
9896
using state_machine_handlers = map<MuxStateChange, bool (MuxCable::*)()>;
9997

10098
void setState(string state);
99+
void rollbackStateChange();
101100
string getState();
102101
bool isStateChangeInProgress() { return st_chg_in_progress_; }
103102
bool isStateChangeFailed() { return st_chg_failed_; }
@@ -122,6 +121,7 @@ class MuxCable
122121
MuxCableType cable_type_;
123122

124123
MuxState state_ = MuxState::MUX_STATE_INIT;
124+
MuxState prev_state_;
125125
bool st_chg_in_progress_ = false;
126126
bool st_chg_failed_ = false;
127127

orchagent/neighorch.cpp

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,12 @@ bool NeighOrch::addNextHop(const NextHopKey &nh)
255255
sai_status_t status = sai_next_hop_api->create_next_hop(&next_hop_id, gSwitchId, (uint32_t)next_hop_attrs.size(), next_hop_attrs.data());
256256
if (status != SAI_STATUS_SUCCESS)
257257
{
258+
if (status == SAI_STATUS_ITEM_ALREADY_EXISTS)
259+
{
260+
SWSS_LOG_NOTICE("Next hop %s on %s already exists",
261+
nexthop.ip_address.to_string().c_str(), nexthop.alias.c_str());
262+
return true;
263+
}
258264
SWSS_LOG_ERROR("Failed to create next hop %s on %s, rv:%d",
259265
nexthop.ip_address.to_string().c_str(), nexthop.alias.c_str(), status);
260266
task_process_status handle_status = handleSaiCreateStatus(SAI_API_NEXT_HOP, status);
@@ -1011,7 +1017,7 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
10111017
/* When next hop is not found, we continue to remove neighbor entry. */
10121018
if (status == SAI_STATUS_ITEM_NOT_FOUND)
10131019
{
1014-
SWSS_LOG_ERROR("Failed to locate next hop %s on %s, rv:%d",
1020+
SWSS_LOG_NOTICE("Next hop %s on %s doesn't exist, rv:%d",
10151021
ip_address.to_string().c_str(), alias.c_str(), status);
10161022
}
10171023
else
@@ -1046,9 +1052,8 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
10461052
{
10471053
if (status == SAI_STATUS_ITEM_NOT_FOUND)
10481054
{
1049-
SWSS_LOG_ERROR("Failed to locate neighbor %s on %s, rv:%d",
1055+
SWSS_LOG_NOTICE("Neighbor %s on %s already removed, rv:%d",
10501056
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str(), status);
1051-
return true;
10521057
}
10531058
else
10541059
{
@@ -1061,22 +1066,24 @@ bool NeighOrch::removeNeighbor(const NeighborEntry &neighborEntry, bool disable)
10611066
}
10621067
}
10631068
}
1064-
1065-
if (neighbor_entry.ip_address.addr_family == SAI_IP_ADDR_FAMILY_IPV4)
1066-
{
1067-
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV4_NEIGHBOR);
1068-
}
10691069
else
10701070
{
1071-
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV6_NEIGHBOR);
1072-
}
1071+
if (neighbor_entry.ip_address.addr_family == SAI_IP_ADDR_FAMILY_IPV4)
1072+
{
1073+
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV4_NEIGHBOR);
1074+
}
1075+
else
1076+
{
1077+
gCrmOrch->decCrmResUsedCounter(CrmResourceType::CRM_IPV6_NEIGHBOR);
1078+
}
10731079

1074-
removeNextHop(ip_address, alias);
1075-
m_intfsOrch->decreaseRouterIntfsRefCount(alias);
1080+
removeNextHop(ip_address, alias);
1081+
m_intfsOrch->decreaseRouterIntfsRefCount(alias);
1082+
SWSS_LOG_NOTICE("Removed neighbor %s on %s",
1083+
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str());
1084+
}
10761085
}
10771086

1078-
SWSS_LOG_NOTICE("Removed neighbor %s on %s",
1079-
m_syncdNeighbors[neighborEntry].mac.to_string().c_str(), alias.c_str());
10801087

10811088
/* Do not delete entry from cache if its disable request */
10821089
if (disable)

tests/mock_tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ tests_SOURCES = aclorch_ut.cpp \
4747
flowcounterrouteorch_ut.cpp \
4848
orchdaemon_ut.cpp \
4949
intfsorch_ut.cpp \
50+
mux_rollback_ut.cpp \
5051
test_failure_handling.cpp \
5152
$(top_srcdir)/lib/gearboxutils.cpp \
5253
$(top_srcdir)/lib/subintf.cpp \

tests/mock_tests/mock_orchagent_main.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ extern VRFOrch *gVrfOrch;
5656
extern NhgOrch *gNhgOrch;
5757
extern Srv6Orch *gSrv6Orch;
5858
extern BfdOrch *gBfdOrch;
59+
extern AclOrch *gAclOrch;
60+
extern PolicerOrch *gPolicerOrch;
5961
extern Directory<Orch*> gDirectory;
6062

6163
extern sai_acl_api_t *sai_acl_api;
@@ -70,6 +72,7 @@ extern sai_route_api_t *sai_route_api;
7072
extern sai_neighbor_api_t *sai_neighbor_api;
7173
extern sai_tunnel_api_t *sai_tunnel_api;
7274
extern sai_next_hop_api_t *sai_next_hop_api;
75+
extern sai_next_hop_group_api_t *sai_next_hop_group_api;
7376
extern sai_hostif_api_t *sai_hostif_api;
7477
extern sai_policer_api_t *sai_policer_api;
7578
extern sai_buffer_api_t *sai_buffer_api;

0 commit comments

Comments
 (0)