Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions orchagent/routeorch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1332,7 +1332,11 @@ bool RouteOrch::addNextHopGroup(const NextHopGroupKey &nexthops)
nhopgroup_shared_set[next_hop_id].insert(it);
}
}

if (!next_hop_ids.size())
{
SWSS_LOG_INFO("Skipping creation of nexthop group as none of nexthop are active");
return false;
}
sai_attribute_t nhg_attr;
vector<sai_attribute_t> nhg_attrs;

Expand Down Expand Up @@ -1717,9 +1721,15 @@ void RouteOrch::addTempRoute(RouteBulkContext& ctx, const NextHopGroupKey &nextH
SWSS_LOG_INFO("Failed to get next hop %s for %s",
(*it).to_string().c_str(), ipPrefix.to_string().c_str());
it = next_hop_set.erase(it);
continue;
}
else
it++;
if(m_neighOrch->isNextHopFlagSet(*it, NHFLAGS_IFDOWN))
{
SWSS_LOG_INFO("Interface down for NH %s, skip this NH", (*it).to_string().c_str());
it = next_hop_set.erase(it);
continue;
}
it++;
}

/* Return if next_hop_set is empty */
Expand Down Expand Up @@ -2423,8 +2433,14 @@ bool RouteOrch::removeRoute(RouteBulkContext& ctx)
size_t creating = gRouteBulker.creating_entries_count(route_entry);
if (it_route == it_route_table->second.end() && creating == 0)
{
SWSS_LOG_INFO("Failed to find route entry, vrf_id 0x%" PRIx64 ", prefix %s\n", vrf_id,
ipPrefix.to_string().c_str());
if (it_route_table->second.size() == 0)
Copy link
Collaborator

@stephenxs stephenxs Jan 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect it's unsafe to remove m_syncdRoutes[vrf_id] here.
gRouteBulker.creating_entries_count(route_entry) can give unexpected result. it just counts the number of certain prefix that are pending creation.
however, if no such prefix but other prefixes belong to the VRF pending creation, it still returns 0 and the VRF routing table is removed.
After that, once the bulk returns, it can not find the VRF table when handling the routing entries, which causes an exception.

We saw the following uncaught C++ exception

(gdb) bt
#0  0x00007f5791aedebc in ?? () from /lib/x86_64-linux-gnu/libc.so.6
#1  0x00007f5791a9efb2 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007f5791a89472 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#3  0x00007f5791de0919 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#4  0x00007f5791debe1a in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#5  0x00007f5791debe85 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6
#6  0x00007f5791dec0d8 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6
#7  0x00007f5791de3240 in std::__throw_out_of_range(char const*) () from /lib/x86_64-linux-gnu/libstdc++.so.6
#8  0x00005594e856d956 in std::map<unsigned long, std::map<swss::IpPrefix, RouteNhg, std::less<swss::IpPrefix>, std::allocator<std::pair<swss::IpPrefix const, RouteNhg> > >, std::less<unsigned long>, std::allocator<std::pair<unsigned long const, std::map<swss::IpPrefix, RouteNhg, std::less<swss::IpPrefix>, std::allocator<std::pair<swss::IpPrefix const, RouteNhg> > > > > >::at (this=<optimized out>, __k=<optimized out>) at /usr/include/c++/12/bits/stl_map.h:551
#9  0x00005594e8564beb in RouteOrch::addRoutePost (this=this@entry=0x5594ea13e080, ctx=..., nextHops=...) at ./orchagent/routeorch.cpp:2145
#10 0x00005594e856b0b2 in RouteOrch::doTask (this=0x5594ea13e080, consumer=...) at ./orchagent/routeorch.cpp:1021
#11 0x00005594e85282d2 in Orch::doTask (this=0x5594ea13e080) at ./orchagent/orch.cpp:553
#12 0x00005594e851909a in OrchDaemon::start (this=this@entry=0x5594ea0a0950) at ./orchagent/orchdaemon.cpp:895
#13 0x00005594e8485632 in main (argc=<optimized out>, argv=<optimized out>) at ./orchagent/main.cpp:818
(gdb) frame 9
#9  0x00005594e8564beb in RouteOrch::addRoutePost (this=this@entry=0x5594ea13e080, ctx=..., nextHops=...) at ./orchagent/routeorch.cpp:2145

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#3477 to fix it

{
m_syncdRoutes.erase(vrf_id);
m_vrfOrch->decreaseVrfRefCount(vrf_id);
}
SWSS_LOG_INFO("Failed to find route entry, vrf_id 0x%" PRIx64 ", prefix %s\n", vrf_id,
ipPrefix.to_string().c_str());

return true;
}

Expand Down
1 change: 1 addition & 0 deletions tests/mock_tests/routeorch_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ namespace routeorch_test
for (const auto &it : ports)
{
portTable.set(it.first, it.second);
portTable.set(it.first, {{ "oper_status", "up" }});
}

// Set PortConfigDone
Expand Down
53 changes: 32 additions & 21 deletions tests/test_sub_port_intf.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,7 @@ def get_default_vrf_oid(self):
assert len(oids) == 1, "Wrong # of default vrfs: %d, expected #: 1." % (len(oids))
return oids[0]

def get_ip_prefix_nhg_oid(self, ip_prefix, vrf_oid=None):
def get_ip_prefix_nhg_oid(self, ip_prefix, vrf_oid=None, prefix_present=True):
if vrf_oid is None:
vrf_oid = self.default_vrf_oid

Expand All @@ -407,18 +407,24 @@ def _access_function():
route_entry_found = True
assert route_entry_key["vr"] == vrf_oid
break

return (route_entry_found, raw_route_entry_key)
if prefix_present:
return (route_entry_found, raw_route_entry_key)
else:
return (not route_entry_found, None)

(route_entry_found, raw_route_entry_key) = wait_for_result(_access_function)

fvs = self.asic_db.get_entry(ASIC_ROUTE_ENTRY_TABLE, raw_route_entry_key)
if not prefix_present:
assert raw_route_entry_key == None
return None
else:
fvs = self.asic_db.get_entry(ASIC_ROUTE_ENTRY_TABLE, raw_route_entry_key)

nhg_oid = fvs.get("SAI_ROUTE_ENTRY_ATTR_NEXT_HOP_ID", "")
assert nhg_oid != ""
assert nhg_oid != "oid:0x0"
nhg_oid = fvs.get("SAI_ROUTE_ENTRY_ATTR_NEXT_HOP_ID", "")
assert nhg_oid != ""
assert nhg_oid != "oid:0x0"

return nhg_oid
return nhg_oid

def check_sub_port_intf_key_existence(self, db, table_name, key):
db.wait_for_matching_keys(table_name, [key])
Expand Down Expand Up @@ -1543,21 +1549,26 @@ def _test_sub_port_intf_oper_down_with_pending_neigh_route_tasks(self, dvs, sub_
self.add_route_appl_db(ip_prefix, nhop_ips, ifnames, vrf_name)

# Verify route entry created in ASIC_DB and get next hop group oid
nhg_oid = self.get_ip_prefix_nhg_oid(ip_prefix, vrf_oid)

# Verify next hop group of the specified oid created in ASIC_DB
self.check_sub_port_intf_key_existence(self.asic_db, ASIC_NEXT_HOP_GROUP_TABLE, nhg_oid)
nhg_oid = self.get_ip_prefix_nhg_oid(ip_prefix, vrf_oid, prefix_present = i < (nhop_num - 1))

# Verify next hop group member # created in ASIC_DB
nhg_member_oids = self.asic_db.wait_for_n_keys(ASIC_NEXT_HOP_GROUP_MEMBER_TABLE,
(nhop_num - 1) - i if create_intf_on_parent_port == False else ((nhop_num - 1) - i) * 2)
if i < (nhop_num - 1):
# Verify next hop group of the specified oid created in ASIC_DB
self.check_sub_port_intf_key_existence(self.asic_db, ASIC_NEXT_HOP_GROUP_TABLE, nhg_oid)

# Verify that next hop group members all belong to the next hop group of the specified oid
fv_dict = {
"SAI_NEXT_HOP_GROUP_MEMBER_ATTR_NEXT_HOP_GROUP_ID": nhg_oid,
}
for nhg_member_oid in nhg_member_oids:
self.check_sub_port_intf_fvs(self.asic_db, ASIC_NEXT_HOP_GROUP_MEMBER_TABLE, nhg_member_oid, fv_dict)
# Verify next hop group member # created in ASIC_DB
nhg_member_oids = self.asic_db.wait_for_n_keys(ASIC_NEXT_HOP_GROUP_MEMBER_TABLE,
(nhop_num - 1) - i if create_intf_on_parent_port == False \
else ((nhop_num - 1) - i) * 2)

# Verify that next hop group members all belong to the next hop group of the specified oid
fv_dict = {
"SAI_NEXT_HOP_GROUP_MEMBER_ATTR_NEXT_HOP_GROUP_ID": nhg_oid,
}
for nhg_member_oid in nhg_member_oids:
self.check_sub_port_intf_fvs(self.asic_db, ASIC_NEXT_HOP_GROUP_MEMBER_TABLE, nhg_member_oid, fv_dict)
else:
assert nhg_oid == None
self.asic_db.wait_for_n_keys(ASIC_NEXT_HOP_GROUP_MEMBER_TABLE, 0)

nhop_cnt = len(self.asic_db.get_keys(ASIC_NEXT_HOP_TABLE))
# Remove next hop objects on sub port interfaces
Expand Down