-
Notifications
You must be signed in to change notification settings - Fork 692
[mux]: Implement rollback for failed mux switchovers #2714
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d410ab2
c7d3e13
fdf2725
eda687a
ee1f47d
bbe6677
c28207c
99104d0
7e3fcb7
d1de878
13b1f68
0724b52
29db8a2
7e7d956
c0c91f2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -116,6 +116,10 @@ static sai_status_t create_route(IpPrefix &pfx, sai_object_id_t nh) | |
| sai_status_t status = sai_route_api->create_route_entry(&route_entry, (uint32_t)attrs.size(), attrs.data()); | ||
| if (status != SAI_STATUS_SUCCESS) | ||
| { | ||
| if (status == SAI_STATUS_ITEM_ALREADY_EXISTS) { | ||
| SWSS_LOG_NOTICE("Tunnel route to %s already exists", pfx.to_string().c_str()); | ||
| return SAI_STATUS_SUCCESS; | ||
| } | ||
| SWSS_LOG_ERROR("Failed to create tunnel route %s,nh %" PRIx64 " rv:%d", | ||
| pfx.getIp().to_string().c_str(), nh, status); | ||
| return status; | ||
|
|
@@ -145,6 +149,10 @@ static sai_status_t remove_route(IpPrefix &pfx) | |
| sai_status_t status = sai_route_api->remove_route_entry(&route_entry); | ||
| if (status != SAI_STATUS_SUCCESS) | ||
| { | ||
| if (status == SAI_STATUS_ITEM_NOT_FOUND) { | ||
| SWSS_LOG_NOTICE("Tunnel route to %s already removed", pfx.to_string().c_str()); | ||
| return SAI_STATUS_SUCCESS; | ||
| } | ||
| SWSS_LOG_ERROR("Failed to remove tunnel route %s, rv:%d", | ||
| pfx.getIp().to_string().c_str(), status); | ||
| return status; | ||
|
|
@@ -469,15 +477,15 @@ void MuxCable::setState(string new_state) | |
|
|
||
| mux_cb_orch_->updateMuxMetricState(mux_name_, new_state, true); | ||
|
|
||
| MuxState state = state_; | ||
| prev_state_ = state_; | ||
| state_ = ns; | ||
|
|
||
| st_chg_in_progress_ = true; | ||
|
|
||
| if (!(this->*(state_machine_handlers_[it->second]))()) | ||
| { | ||
| //Reset back to original state | ||
| state_ = state; | ||
| state_ = prev_state_; | ||
| st_chg_in_progress_ = false; | ||
| st_chg_failed_ = true; | ||
| throw std::runtime_error("Failed to handle state transition"); | ||
|
|
@@ -493,6 +501,51 @@ void MuxCable::setState(string new_state) | |
| return; | ||
| } | ||
|
|
||
| void MuxCable::rollbackStateChange() | ||
| { | ||
| if (prev_state_ == MuxState::MUX_STATE_FAILED || prev_state_ == MuxState::MUX_STATE_PENDING) | ||
| { | ||
| SWSS_LOG_ERROR("[%s] Rollback to %s not supported", mux_name_.c_str(), | ||
| muxStateValToString.at(prev_state_).c_str()); | ||
| return; | ||
| } | ||
| SWSS_LOG_WARN("[%s] Rolling back state change to %s", mux_name_.c_str(), | ||
| muxStateValToString.at(prev_state_).c_str()); | ||
| mux_cb_orch_->updateMuxMetricState(mux_name_, muxStateValToString.at(prev_state_), true); | ||
| st_chg_in_progress_ = true; | ||
| state_ = prev_state_; | ||
| bool success = false; | ||
| switch (prev_state_) | ||
| { | ||
| case MuxState::MUX_STATE_ACTIVE: | ||
| success = stateActive(); | ||
| break; | ||
| case MuxState::MUX_STATE_INIT: | ||
| case MuxState::MUX_STATE_STANDBY: | ||
| success = stateStandby(); | ||
| break; | ||
| case MuxState::MUX_STATE_FAILED: | ||
| case MuxState::MUX_STATE_PENDING: | ||
| // Check at the start of the function means we will never reach here | ||
| SWSS_LOG_ERROR("[%s] Rollback to %s not supported", mux_name_.c_str(), | ||
| muxStateValToString.at(prev_state_).c_str()); | ||
| return; | ||
|
Collaborator
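There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This will leave `st_chg_in_progress_` set to true. [inline code lost in extraction; reconstructed from the early `return` that follows `st_chg_in_progress_ = true`]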
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, but we will never get here, since the check on the first line of this function will return early for `MUX_STATE_FAILED` and `MUX_STATE_PENDING`. |
||
| } | ||
| st_chg_in_progress_ = false; | ||
| if (success) | ||
| { | ||
| st_chg_failed_ = false; | ||
| } | ||
| else | ||
| { | ||
| st_chg_failed_ = true; | ||
| SWSS_LOG_ERROR("[%s] Rollback to %s failed", | ||
| mux_name_.c_str(), muxStateValToString.at(prev_state_).c_str()); | ||
| } | ||
| mux_cb_orch_->updateMuxMetricState(mux_name_, muxStateValToString.at(state_), false); | ||
| mux_cb_orch_->updateMuxState(mux_name_, muxStateValToString.at(state_)); | ||
| } | ||
|
|
||
| string MuxCable::getState() | ||
| { | ||
| SWSS_LOG_INFO("Get state request for %s, state %s", | ||
|
|
@@ -785,8 +838,6 @@ void MuxNbrHandler::updateTunnelRoute(NextHopKey nh, bool add) | |
| } | ||
| } | ||
|
|
||
| std::map<std::string, AclTable> MuxAclHandler::acl_table_; | ||
|
|
||
| MuxAclHandler::MuxAclHandler(sai_object_id_t port, string alias) | ||
| { | ||
| SWSS_LOG_ENTER(); | ||
|
|
@@ -804,32 +855,21 @@ MuxAclHandler::MuxAclHandler(sai_object_id_t port, string alias) | |
| port_ = port; | ||
| alias_ = alias; | ||
|
|
||
| auto found = acl_table_.find(table_name); | ||
| if (found == acl_table_.end()) | ||
| { | ||
| SWSS_LOG_NOTICE("First time create for port %" PRIx64 "", port); | ||
| // Always try to create the table first. If it already exists, function will return early. | ||
| createMuxAclTable(port, table_name); | ||
|
|
||
| // First time handling of Mux Table, create ACL table, and bind | ||
| createMuxAclTable(port, table_name); | ||
| SWSS_LOG_NOTICE("Binding port %" PRIx64 "", port); | ||
|
|
||
| AclRule* rule = gAclOrch->getAclRule(table_name, rule_name); | ||
| if (rule == nullptr) | ||
| { | ||
| shared_ptr<AclRulePacket> newRule = | ||
| make_shared<AclRulePacket>(gAclOrch, rule_name, table_name, false /*no counters*/); | ||
| createMuxAclRule(newRule, table_name); | ||
| } | ||
| else | ||
| { | ||
| SWSS_LOG_NOTICE("Binding port %" PRIx64 "", port); | ||
|
|
||
| AclRule* rule = gAclOrch->getAclRule(table_name, rule_name); | ||
| if (rule == nullptr) | ||
| { | ||
| shared_ptr<AclRulePacket> newRule = | ||
| make_shared<AclRulePacket>(gAclOrch, rule_name, table_name, false /*no counters*/); | ||
| createMuxAclRule(newRule, table_name); | ||
| } | ||
| else | ||
| { | ||
| gAclOrch->updateAclRule(table_name, rule_name, MATCH_IN_PORTS, &port, RULE_OPER_ADD); | ||
| } | ||
| gAclOrch->updateAclRule(table_name, rule_name, MATCH_IN_PORTS, &port, RULE_OPER_ADD); | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -862,23 +902,16 @@ void MuxAclHandler::createMuxAclTable(sai_object_id_t port, string strTable) | |
| { | ||
| SWSS_LOG_ENTER(); | ||
|
|
||
| auto inserted = acl_table_.emplace(piecewise_construct, | ||
| std::forward_as_tuple(strTable), | ||
| std::forward_as_tuple(gAclOrch, strTable)); | ||
|
|
||
| assert(inserted.second); | ||
|
|
||
| AclTable& acl_table = inserted.first->second; | ||
|
|
||
| sai_object_id_t table_oid = gAclOrch->getTableById(strTable); | ||
| if (table_oid != SAI_NULL_OBJECT_ID) | ||
| { | ||
| // DROP ACL table is already created | ||
| SWSS_LOG_NOTICE("ACL table %s exists, reuse the same", strTable.c_str()); | ||
| acl_table = *(gAclOrch->getTableByOid(table_oid)); | ||
| SWSS_LOG_INFO("ACL table %s exists, reuse the same", strTable.c_str()); | ||
| return; | ||
| } | ||
|
|
||
| SWSS_LOG_NOTICE("First time create for port %" PRIx64 "", port); | ||
| AclTable acl_table(gAclOrch, strTable); | ||
| auto dropType = gAclOrch->getAclTableType(TABLE_TYPE_DROP); | ||
| assert(dropType); | ||
| acl_table.validateAddType(*dropType); | ||
|
|
@@ -1582,10 +1615,25 @@ bool MuxCableOrch::addOperation(const Request& request) | |
| { | ||
| mux_obj->setState(state); | ||
| } | ||
| catch(const std::runtime_error& error) | ||
| catch(const std::runtime_error& e) | ||
| { | ||
| SWSS_LOG_ERROR("Mux Error setting state %s for port %s. Error: %s", | ||
| state.c_str(), port_name.c_str(), error.what()); | ||
| state.c_str(), port_name.c_str(), e.what()); | ||
| mux_obj->rollbackStateChange(); | ||
| return true; | ||
| } | ||
| catch (const std::logic_error& e) | ||
| { | ||
| SWSS_LOG_ERROR("Logic error while setting state %s for port %s. Error: %s", | ||
| state.c_str(), port_name.c_str(), e.what()); | ||
| mux_obj->rollbackStateChange(); | ||
| return true; | ||
| } | ||
| catch (const std::exception& e) | ||
| { | ||
| SWSS_LOG_ERROR("Exception caught while setting state %s for port %s. Error: %s", | ||
| state.c_str(), port_name.c_str(), e.what()); | ||
| mux_obj->rollbackStateChange(); | ||
| return true; | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -52,8 +52,6 @@ class MuxAclHandler | |
| void createMuxAclRule(shared_ptr<AclRulePacket> rule, string strTable); | ||
| void bindAllPorts(AclTable &acl_table); | ||
|
|
||
| // class shared dict: ACL table name -> ACL table | ||
| static std::map<std::string, AclTable> acl_table_; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why is this removed? I think it's changing the overall logic.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Based on my understanding, this was used to cache whether the ACL table had already been created in hardware. I changed the behavior to delegate this check to AclOrch instead, since the […] |
||
| sai_object_id_t port_ = SAI_NULL_OBJECT_ID; | ||
| bool is_ingress_acl_ = true; | ||
| string alias_; | ||
|
|
@@ -97,6 +95,7 @@ class MuxCable | |
| using state_machine_handlers = map<MuxStateChange, bool (MuxCable::*)()>; | ||
|
|
||
| void setState(string state); | ||
| void rollbackStateChange(); | ||
| string getState(); | ||
| bool isStateChangeInProgress() { return st_chg_in_progress_; } | ||
| bool isStateChangeFailed() { return st_chg_failed_; } | ||
|
|
@@ -120,6 +119,7 @@ class MuxCable | |
| MuxCableType cable_type_; | ||
|
|
||
| MuxState state_ = MuxState::MUX_STATE_INIT; | ||
| MuxState prev_state_; | ||
| bool st_chg_in_progress_ = false; | ||
| bool st_chg_failed_ = false; | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@theasianpianist, is this change required for mux idempotency/rollback? @bingwang-ms, can you please review?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the ACL entry already exists, the SAI API will return this status. When we roll back we might hit this scenario, so we want to continue normally if the entry already exists.