Commit 7407dbf
[RlsLB] Fix Deadlock (grpc#37459)
Internal bug: b/357864682
A lock ordering inversion was noticed with the following stacks -
```
[mutex.cc : 1418] RAW: Potential Mutex deadlock:
@ 0x564f4ce62fe5 absl::lts_20240116::DebugOnlyDeadlockCheck()
@ 0x564f4ce632dc absl::lts_20240116::Mutex::Lock()
@ 0x564f4be5886c absl::lts_20240116::MutexLock::MutexLock()
@ 0x564f4be968c5 grpc::internal::OpenTelemetryPluginImpl::RemoveCallback()
@ 0x564f4cd097b8 grpc_core::RegisteredMetricCallback::~RegisteredMetricCallback()
@ 0x564f4c1f1216 std::default_delete<>::operator()()
@ 0x564f4c1f157f std::__uniq_ptr_impl<>::reset()
@ 0x564f4c1ee967 std::unique_ptr<>::reset()
@ 0x564f4c352f44 grpc_core::GrpcXdsClient::Orphaned()
@ 0x564f4c25dad1 grpc_core::DualRefCounted<>::Unref()
@ 0x564f4c4653ed grpc_core::RefCountedPtr<>::reset()
@ 0x564f4c463c73 grpc_core::XdsClusterDropStats::~XdsClusterDropStats()
@ 0x564f4c463d02 grpc_core::XdsClusterDropStats::~XdsClusterDropStats()
@ 0x564f4c25efa9 grpc_core::UnrefDelete::operator()<>()
@ 0x564f4c25d5f0 grpc_core::RefCounted<>::Unref()
@ 0x564f4c25c2d9 grpc_core::RefCountedPtr<>::~RefCountedPtr()
@ 0x564f4c25b1d8 grpc_core::(anonymous namespace)::XdsClusterImplLb::Picker::~Picker()
@ 0x564f4c25b240 grpc_core::(anonymous namespace)::XdsClusterImplLb::Picker::~Picker()
@ 0x564f4c12c71a grpc_core::UnrefDelete::operator()<>()
@ 0x564f4c1292ac grpc_core::DualRefCounted<>::WeakUnref()
@ 0x564f4c124fb8 grpc_core::DualRefCounted<>::Unref()
@ 0x564f4c11f029 grpc_core::RefCountedPtr<>::~RefCountedPtr()
@ 0x564f4c14e958 grpc_core::(anonymous namespace)::OutlierDetectionLb::Picker::~Picker()
@ 0x564f4c14e980 grpc_core::(anonymous namespace)::OutlierDetectionLb::Picker::~Picker()
@ 0x564f4c12c71a grpc_core::UnrefDelete::operator()<>()
@ 0x564f4c1292ac grpc_core::DualRefCounted<>::WeakUnref()
@ 0x564f4c124fb8 grpc_core::DualRefCounted<>::Unref()
@ 0x564f4c11f029 grpc_core::RefCountedPtr<>::~RefCountedPtr()
@ 0x564f4c26bafc std::pair<>::~pair()
@ 0x564f4c26bb28 __gnu_cxx::new_allocator<>::destroy<>()
@ 0x564f4c26b88f std::allocator_traits<>::destroy<>()
@ 0x564f4c26b297 std::_Rb_tree<>::_M_destroy_node()
@ 0x564f4c26abfb std::_Rb_tree<>::_M_drop_node()
@ 0x564f4c26a926 std::_Rb_tree<>::_M_erase()
@ 0x564f4c26a6f0 std::_Rb_tree<>::~_Rb_tree()
@ 0x564f4c26a62a std::map<>::~map()
@ 0x564f4c2691a4 grpc_core::(anonymous namespace)::XdsClusterManagerLb::ClusterPicker::~ClusterPicker()
@ 0x564f4c2691cc grpc_core::(anonymous namespace)::XdsClusterManagerLb::ClusterPicker::~ClusterPicker()
@ 0x564f4c12c71a grpc_core::UnrefDelete::operator()<>()
@ 0x564f4c1292ac grpc_core::DualRefCounted<>::WeakUnref()
[mutex.cc : 1428] RAW: Acquiring absl::Mutex 0x564f4f22ad40 while holding 0x7f939834bb70; a cycle in the historical lock ordering graph has been observed
[mutex.cc : 1432] RAW: Cycle:
[mutex.cc : 1446] RAW: mutex@0x564f4f22ad40 stack:
@ 0x564f4ce62fe5 absl::lts_20240116::DebugOnlyDeadlockCheck()
@ 0x564f4ce632dc absl::lts_20240116::Mutex::Lock()
@ 0x564f4be5886c absl::lts_20240116::MutexLock::MutexLock()
@ 0x564f4be96124 grpc::internal::OpenTelemetryPluginImpl::AddCallback()
@ 0x564f4cd096f0 grpc_core::RegisteredMetricCallback::RegisteredMetricCallback()
@ 0x564f4c1f111b std::make_unique<>()
@ 0x564f4c3564b0 grpc_core::GlobalStatsPluginRegistry::StatsPluginGroup::RegisterCallback<>()
@ 0x564f4c352dea grpc_core::GrpcXdsClient::GrpcXdsClient()
@ 0x564f4c355bc6 grpc_core::MakeRefCounted<>()
@ 0x564f4c3525f2 grpc_core::GrpcXdsClient::GetOrCreate()
@ 0x564f4c28f8f8 grpc_core::(anonymous namespace)::XdsResolver::StartLocked()
@ 0x564f4c2f5f82 grpc_core::(anonymous namespace)::GoogleCloud2ProdResolver::StartXdsResolver()
@ 0x564f4c2f515d grpc_core::(anonymous namespace)::GoogleCloud2ProdResolver::ZoneQueryDone()
@ 0x564f4c2f496b grpc_core::(anonymous namespace)::GoogleCloud2ProdResolver::StartLocked()::{lambda()#1}::operator()()::{lambda()#1}::operator()()
@ 0x564f4c2f80f6 std::__invoke_impl<>()
@ 0x564f4c2f7b9d _ZSt10__invoke_rIvRZZN9grpc_core12_GLOBAL__N_124GoogleCloud2ProdResolver11StartLockedEvENUlNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEN4absl12lts_202401168StatusOrIS8_EEE_clES8_SC_EUlvE_J...
@ 0x564f4c2f748c std::_Function_handler<>::_M_invoke()
@ 0x564f4b8ad682 std::function<>::operator()()
@ 0x564f4cd1c6bf grpc_core::WorkSerializer::LegacyWorkSerializer::Run()
@ 0x564f4cd1dae4 grpc_core::WorkSerializer::Run()
@ 0x564f4c2f4b0b grpc_core::(anonymous namespace)::GoogleCloud2ProdResolver::StartLocked()::{lambda()#1}::operator()()
@ 0x564f4c2f8dc7 absl::lts_20240116::base_internal::Callable::Invoke<>()
@ 0x564f4c2f8cb8 absl::lts_20240116::base_internal::invoke<>()
@ 0x564f4c2f8b16 absl::lts_20240116::internal_any_invocable::InvokeR<>()
@ 0x564f4c2f8a0c absl::lts_20240116::internal_any_invocable::LocalInvoker<>()
@ 0x564f4c2fb88d absl::lts_20240116::internal_any_invocable::Impl<>::operator()()
@ 0x564f4c2fb1f3 grpc_core::GcpMetadataQuery::OnDone()
@ 0x564f4cd75a72 exec_ctx_run()
@ 0x564f4cd75ba9 grpc_core::ExecCtx::Flush()
@ 0x564f4cc8ee1d end_worker()
@ 0x564f4cc8f304 pollset_work()
@ 0x564f4cc5dcaf pollset_work()
@ 0x564f4cc69220 grpc_pollset_work()
@ 0x564f4cbe7733 cq_pluck()
@ 0x564f4cbe7ad5 grpc_completion_queue_pluck
@ 0x564f4bc61d96 grpc::CompletionQueue::Pluck()
@ 0x564f4bfdb055 grpc::ClientReader<>::ClientReader<>()
@ 0x564f4bfd6035 grpc::internal::ClientReaderFactory<>::Create<>()
@ 0x564f4bfc322b google::storage::v2::Storage::Stub::ReadObjectRaw()
@ 0x564f4bf9934b google::storage::v2::Storage::Stub::ReadObject()
[mutex.cc : 1446] RAW: mutex@0x7f939834bb70 stack:
@ 0x564f4ce62fe5 absl::lts_20240116::DebugOnlyDeadlockCheck()
@ 0x564f4ce632dc absl::lts_20240116::Mutex::Lock()
@ 0x564f4be5886c absl::lts_20240116::MutexLock::MutexLock()
@ 0x564f4c1ce9eb grpc_core::(anonymous namespace)::RlsLb::RlsLb()::{lambda()#1}::operator()()
@ 0x564f4c1e794c absl::lts_20240116::base_internal::Callable::Invoke<>()
@ 0x564f4c1e72c1 absl::lts_20240116::base_internal::invoke<>()
@ 0x564f4c1e6af1 absl::lts_20240116::internal_any_invocable::InvokeR<>()
@ 0x564f4c1e5d6c absl::lts_20240116::internal_any_invocable::LocalInvoker<>()
@ 0x564f4be9d0c8 absl::lts_20240116::internal_any_invocable::Impl<>::operator()()
@ 0x564f4be9b4ff grpc_core::RegisteredMetricCallback::Run()
@ 0x564f4bea07ae grpc::internal::OpenTelemetryPluginImpl::CallbackGaugeState<>::CallbackGaugeCallback()
@ 0x564f4bf844de opentelemetry::v1::sdk::metrics::ObservableRegistry::Observe()
@ 0x564f4bf56529 opentelemetry::v1::sdk::metrics::Meter::Collect()
@ 0x564f4bf8c1d5 opentelemetry::v1::sdk::metrics::MetricCollector::Collect()::{lambda()#1}::operator()()
@ 0x564f4bf8c5ac opentelemetry::v1::nostd::function_ref<>::BindTo<>()::{lambda()#1}::operator()()
@ 0x564f4bf8c5e8 opentelemetry::v1::nostd::function_ref<>::BindTo<>()::{lambda()#1}::_FUN()
@ 0x564f4bf7604d opentelemetry::v1::nostd::function_ref<>::operator()()
@ 0x564f4bf74ad9 opentelemetry::v1::sdk::metrics::MeterContext::ForEachMeter()
@ 0x564f4bf8c457 opentelemetry::v1::sdk::metrics::MetricCollector::Collect()
@ 0x564f4bf4a7fe opentelemetry::v1::sdk::metrics::MetricReader::Collect()
@ 0x564f4bed5e24 opentelemetry::v1::exporter::metrics::PrometheusCollector::Collect()
@ 0x564f4bef004f prometheus::detail::CollectMetrics()
@ 0x564f4beec26d prometheus::detail::MetricsHandler::handleGet()
@ 0x564f4bf1cd8b CivetServer::requestHandler()
@ 0x564f4bf35e7b handle_request
@ 0x564f4bf29534 handle_request_stat_log
@ 0x564f4bf39b3f process_new_connection
@ 0x564f4bf3a448 worker_thread_run
@ 0x564f4bf3a57f worker_thread
@ 0x7f93e9137ea7 start_thread
[mutex.cc : 1454] RAW: dying due to potential deadlock
Aborted
```
From the stack, it looks like we are ending up holding a lock to the `RlsLB` policy while removing a callback from the gRPC OpenTelemetry plugin, which is a lock ordering inversion. The correct order is `OpenTelemetry` -> `gRPC OpenTelemetry plugin` -> `gRPC Component like RLS/xDSClient`.
A common pattern we employ for metrics is for the callbacks to be unregistered when the corresponding component object is orphaned/destroyed (unreffing). Also, note that removing callbacks requires a lock in `gRPC OpenTelemetry plugin`. To avoid deadlocks, we remove the callback inside `RlsLb` from outside the critical region, but `RlsLb` owns refs to child policies which in turn hold refs to `XdsClient`. The lock ordering inversion occurred due to unreffing child policies within the critical region.
This PR is an alternative fix to this problem. Original fix in grpc#37425.
Verified that it fixes the bug.
Closes grpc#37459
COPYBARA_INTEGRATE_REVIEW=grpc#37459 from yashykt:FixDeadlocks ec7fbcf
PiperOrigin-RevId: 6633604271 parent a09aaf0 commit 7407dbf
1 file changed
+90
-33
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
353 | 353 | | |
354 | 354 | | |
355 | 355 | | |
356 | | - | |
| 356 | + | |
| 357 | + | |
357 | 358 | | |
358 | 359 | | |
359 | 360 | | |
| |||
397 | 398 | | |
398 | 399 | | |
399 | 400 | | |
400 | | - | |
| 401 | + | |
401 | 402 | | |
402 | 403 | | |
403 | 404 | | |
404 | 405 | | |
405 | 406 | | |
406 | 407 | | |
407 | | - | |
| 408 | + | |
408 | 409 | | |
409 | 410 | | |
410 | 411 | | |
| |||
503 | 504 | | |
504 | 505 | | |
505 | 506 | | |
506 | | - | |
| 507 | + | |
| 508 | + | |
507 | 509 | | |
508 | 510 | | |
509 | 511 | | |
510 | 512 | | |
511 | 513 | | |
| 514 | + | |
| 515 | + | |
| 516 | + | |
| 517 | + | |
| 518 | + | |
| 519 | + | |
| 520 | + | |
| 521 | + | |
| 522 | + | |
| 523 | + | |
| 524 | + | |
| 525 | + | |
512 | 526 | | |
513 | 527 | | |
514 | 528 | | |
| |||
566 | 580 | | |
567 | 581 | | |
568 | 582 | | |
569 | | - | |
| 583 | + | |
| 584 | + | |
| 585 | + | |
570 | 586 | | |
571 | 587 | | |
572 | 588 | | |
573 | 589 | | |
574 | 590 | | |
575 | | - | |
| 591 | + | |
| 592 | + | |
| 593 | + | |
576 | 594 | | |
577 | 595 | | |
578 | 596 | | |
579 | 597 | | |
580 | 598 | | |
581 | | - | |
| 599 | + | |
| 600 | + | |
582 | 601 | | |
583 | 602 | | |
584 | 603 | | |
| |||
594 | 613 | | |
595 | 614 | | |
596 | 615 | | |
597 | | - | |
| 616 | + | |
| 617 | + | |
| 618 | + | |
598 | 619 | | |
599 | 620 | | |
600 | 621 | | |
| |||
857 | 878 | | |
858 | 879 | | |
859 | 880 | | |
860 | | - | |
| 881 | + | |
| 882 | + | |
861 | 883 | | |
862 | 884 | | |
863 | 885 | | |
| |||
880 | 902 | | |
881 | 903 | | |
882 | 904 | | |
883 | | - | |
| 905 | + | |
884 | 906 | | |
885 | 907 | | |
886 | 908 | | |
| |||
934 | 956 | | |
935 | 957 | | |
936 | 958 | | |
| 959 | + | |
937 | 960 | | |
938 | 961 | | |
939 | | - | |
940 | 962 | | |
941 | 963 | | |
942 | 964 | | |
| |||
946 | 968 | | |
947 | 969 | | |
948 | 970 | | |
949 | | - | |
| 971 | + | |
| 972 | + | |
950 | 973 | | |
951 | 974 | | |
952 | 975 | | |
| |||
1194 | 1217 | | |
1195 | 1218 | | |
1196 | 1219 | | |
| 1220 | + | |
1197 | 1221 | | |
1198 | 1222 | | |
1199 | 1223 | | |
1200 | 1224 | | |
1201 | 1225 | | |
1202 | 1226 | | |
| 1227 | + | |
1203 | 1228 | | |
1204 | 1229 | | |
1205 | 1230 | | |
1206 | 1231 | | |
1207 | 1232 | | |
1208 | | - | |
1209 | 1233 | | |
1210 | 1234 | | |
1211 | 1235 | | |
| |||
1284 | 1308 | | |
1285 | 1309 | | |
1286 | 1310 | | |
1287 | | - | |
| 1311 | + | |
| 1312 | + | |
1288 | 1313 | | |
1289 | 1314 | | |
1290 | 1315 | | |
| |||
1345 | 1370 | | |
1346 | 1371 | | |
1347 | 1372 | | |
1348 | | - | |
| 1373 | + | |
1349 | 1374 | | |
1350 | 1375 | | |
1351 | 1376 | | |
| |||
1382 | 1407 | | |
1383 | 1408 | | |
1384 | 1409 | | |
1385 | | - | |
| 1410 | + | |
| 1411 | + | |
| 1412 | + | |
1386 | 1413 | | |
1387 | 1414 | | |
1388 | 1415 | | |
1389 | 1416 | | |
1390 | | - | |
| 1417 | + | |
| 1418 | + | |
1391 | 1419 | | |
1392 | 1420 | | |
1393 | 1421 | | |
| |||
1405 | 1433 | | |
1406 | 1434 | | |
1407 | 1435 | | |
1408 | | - | |
| 1436 | + | |
| 1437 | + | |
| 1438 | + | |
1409 | 1439 | | |
1410 | 1440 | | |
1411 | 1441 | | |
1412 | | - | |
| 1442 | + | |
1413 | 1443 | | |
1414 | 1444 | | |
1415 | 1445 | | |
| |||
1419 | 1449 | | |
1420 | 1450 | | |
1421 | 1451 | | |
1422 | | - | |
| 1452 | + | |
| 1453 | + | |
| 1454 | + | |
| 1455 | + | |
| 1456 | + | |
| 1457 | + | |
1423 | 1458 | | |
1424 | 1459 | | |
1425 | 1460 | | |
| |||
1429 | 1464 | | |
1430 | 1465 | | |
1431 | 1466 | | |
| 1467 | + | |
1432 | 1468 | | |
1433 | 1469 | | |
1434 | 1470 | | |
| |||
1464 | 1500 | | |
1465 | 1501 | | |
1466 | 1502 | | |
| 1503 | + | |
| 1504 | + | |
1467 | 1505 | | |
1468 | 1506 | | |
1469 | 1507 | | |
1470 | 1508 | | |
1471 | 1509 | | |
1472 | 1510 | | |
| 1511 | + | |
1473 | 1512 | | |
1474 | 1513 | | |
1475 | 1514 | | |
| |||
1483 | 1522 | | |
1484 | 1523 | | |
1485 | 1524 | | |
1486 | | - | |
| 1525 | + | |
| 1526 | + | |
| 1527 | + | |
1487 | 1528 | | |
1488 | 1529 | | |
1489 | 1530 | | |
| |||
1494 | 1535 | | |
1495 | 1536 | | |
1496 | 1537 | | |
| 1538 | + | |
1497 | 1539 | | |
1498 | 1540 | | |
1499 | 1541 | | |
| |||
1814 | 1856 | | |
1815 | 1857 | | |
1816 | 1858 | | |
| 1859 | + | |
| 1860 | + | |
| 1861 | + | |
1817 | 1862 | | |
1818 | 1863 | | |
1819 | 1864 | | |
1820 | 1865 | | |
1821 | | - | |
| 1866 | + | |
| 1867 | + | |
1822 | 1868 | | |
1823 | | - | |
| 1869 | + | |
| 1870 | + | |
1824 | 1871 | | |
1825 | 1872 | | |
1826 | 1873 | | |
| |||
1999 | 2046 | | |
2000 | 2047 | | |
2001 | 2048 | | |
| 2049 | + | |
| 2050 | + | |
| 2051 | + | |
2002 | 2052 | | |
2003 | 2053 | | |
2004 | 2054 | | |
| |||
2010 | 2060 | | |
2011 | 2061 | | |
2012 | 2062 | | |
2013 | | - | |
| 2063 | + | |
| 2064 | + | |
2014 | 2065 | | |
2015 | 2066 | | |
2016 | 2067 | | |
2017 | 2068 | | |
2018 | 2069 | | |
2019 | 2070 | | |
2020 | | - | |
| 2071 | + | |
2021 | 2072 | | |
2022 | 2073 | | |
2023 | 2074 | | |
2024 | 2075 | | |
2025 | | - | |
| 2076 | + | |
2026 | 2077 | | |
2027 | 2078 | | |
2028 | 2079 | | |
| |||
2097 | 2148 | | |
2098 | 2149 | | |
2099 | 2150 | | |
2100 | | - | |
2101 | | - | |
2102 | | - | |
| 2151 | + | |
| 2152 | + | |
| 2153 | + | |
| 2154 | + | |
| 2155 | + | |
| 2156 | + | |
| 2157 | + | |
| 2158 | + | |
| 2159 | + | |
| 2160 | + | |
| 2161 | + | |
| 2162 | + | |
| 2163 | + | |
2103 | 2164 | | |
2104 | | - | |
2105 | | - | |
2106 | | - | |
2107 | | - | |
2108 | 2165 | | |
2109 | 2166 | | |
2110 | 2167 | | |
| |||
0 commit comments