Skip to content

Commit 15f79be

Browse files
committed
Fix SIGABRT when destroying OdCDS handles on worker threads
The OdCdsApiHandleImpl destructor can be invoked on worker threads when filters are removed (e.g., during VHDS filter updates). When the last reference to an OdCdsApiSharedPtr is destroyed on a worker thread, the subscription cleanup code attempts to run on the wrong thread, causing an assertion failure or SIGABRT since subscription operations must execute on the main thread. This fix adds a destructor to OdCdsApiHandleImpl that checks if destruction is happening on a worker thread. If so, it dispatches the OdCdsApiSharedPtr destruction to the main thread dispatcher via a lambda, ensuring the subscription is properly cleaned up on the thread where it was created. The fix follows the same pattern used elsewhere in Envoy (e.g., ThreadLocal::SlotImpl destructor) for cross-thread cleanup operations. Note: A longer-term optimization could cache and reuse OdCDS handles in ClusterManager with reference counting and deferred cleanup, avoiding unnecessary subscription teardown/recreation during filter rotations. This is tracked by the existing TODO in allocateOdCdsApi(). Signed-off-by: William Dauchy <[email protected]>
1 parent 6602938 commit 15f79be

File tree

2 files changed

+74
-0
lines changed

2 files changed

+74
-0
lines changed

source/common/upstream/cluster_manager_impl.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "envoy/upstream/cluster_manager.h"
3131

3232
#include "source/common/common/cleanup.h"
33+
#include "source/common/common/thread.h"
3334
#include "source/common/http/async_client_impl.h"
3435
#include "source/common/http/http_server_properties_cache_impl.h"
3536
#include "source/common/http/http_server_properties_cache_manager_impl.h"
@@ -481,6 +482,24 @@ class ClusterManagerImpl : public ClusterManager,
481482
ASSERT(odcds_ != nullptr);
482483
}
483484

485+
~OdCdsApiHandleImpl() override {
486+
// The OdCdsApiSharedPtr contains a subscription that must be destroyed on the main thread.
487+
// If we're being destroyed on a worker thread, we need to dispatch the destruction to the
488+
// main thread dispatcher. Otherwise, the subscription cleanup will trigger an assertion
489+
// failure or SIGABRT.
490+
if (!Thread::MainThread::isMainThread() && odcds_ != nullptr) {
491+
// Move the shared_ptr into a lambda that will be executed on the main thread.
492+
// This ensures the subscription is destroyed on the main thread where it was created.
493+
parent_.dispatcher_.post([odcds = std::move(odcds_)]() {
494+
// The shared_ptr will be destroyed here on the main thread, ensuring proper cleanup
495+
// of the subscription.
496+
});
497+
// Reset the member pointer since we've moved ownership to the lambda.
498+
odcds_.reset();
499+
}
500+
// If we're on the main thread, the normal destructor will handle cleanup.
501+
}
502+
484503
ClusterDiscoveryCallbackHandlePtr
485504
requestOnDemandClusterDiscovery(absl::string_view name, ClusterDiscoveryCallbackPtr callback,
486505
std::chrono::milliseconds timeout) override {

test/common/upstream/odcd_test.cc

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
#include <chrono>
22

3+
#include "envoy/api/api.h"
34
#include "envoy/upstream/cluster_manager.h"
45

6+
#include "source/common/common/thread.h"
57
#include "source/common/config/xds_resource.h"
68

79
#include "test/common/upstream/cluster_manager_impl_test_common.h"
810
#include "test/mocks/upstream/od_cds_api.h"
11+
#include "test/test_common/utility.h"
912

1013
#include "gmock/gmock.h"
1114
#include "gtest/gtest.h"
1215

16+
using testing::Return;
17+
1318
namespace Envoy {
1419
namespace Upstream {
1520
namespace {
@@ -275,6 +280,56 @@ TEST_F(ODCDTest, TestMainThreadDiscoveryInProgressDetection) {
275280
odcds_handle_->requestOnDemandClusterDiscovery("cluster_foo", std::move(cb2), timeout_);
276281
}
277282

283+
// Test that destroying an OdCdsApiHandle from a worker thread properly dispatches cleanup
284+
// to the main thread, preventing SIGABRT. This simulates the scenario where a filter is
285+
// removed during VHDS updates, causing the handle to be destroyed on a worker thread.
286+
TEST_F(ODCDTest, TestDestroyHandleFromWorkerThread) {
287+
// Create a handle on the main thread (simulating filter creation)
288+
auto handle_to_destroy = cluster_manager_->createOdCdsApiHandle(odcds_);
289+
290+
// Override the MockDispatcher's post() behavior to allow cross-thread posting.
291+
// The default mock behavior asserts that callbacks run on the same thread, but
292+
// our fix intentionally posts from worker thread to main thread. Allow multiple
293+
// calls since there may be other posts happening during destruction.
294+
EXPECT_CALL(factory_.dispatcher_, post(_))
295+
.WillRepeatedly([]() {
296+
// Allow the post to succeed without asserting - the callback will be
297+
// executed later on the main thread.
298+
});
299+
300+
// Create a worker thread to destroy the handle (simulating filter removal on worker thread)
301+
bool destruction_completed = false;
302+
Api::ApiPtr api = Api::createApiForTest();
303+
Event::DispatcherPtr worker_dispatcher(api->allocateDispatcher("test_worker_thread"));
304+
305+
Thread::ThreadPtr worker_thread = Thread::threadFactoryForTest().createThread(
306+
[&handle_to_destroy, &destruction_completed, &worker_dispatcher]() {
307+
// Skip thread assertions since we're intentionally on worker thread
308+
Thread::SkipAsserts skip;
309+
310+
// Verify we're on a worker thread
311+
EXPECT_FALSE(Thread::MainThread::isMainThread());
312+
313+
// Destroy the handle from worker thread - this should dispatch cleanup to main thread
314+
// via dispatcher.post() and not cause SIGABRT. The fix ensures the OdCdsApiSharedPtr
315+
// destruction happens on the main thread where the subscription was created.
316+
handle_to_destroy.reset();
317+
destruction_completed = true;
318+
319+
// Run the dispatcher to process any posted callbacks
320+
worker_dispatcher->run(Event::Dispatcher::RunType::NonBlock);
321+
});
322+
323+
// Wait for worker thread to complete
324+
worker_thread->join();
325+
326+
// Verify destruction completed without crash
327+
EXPECT_TRUE(destruction_completed);
328+
329+
// Run main dispatcher to process cleanup callbacks that were posted from worker thread
330+
factory_.tls_.dispatcher().run(Event::Dispatcher::RunType::NonBlock);
331+
}
332+
278333
} // namespace
279334
} // namespace Upstream
280335
} // namespace Envoy

0 commit comments

Comments
 (0)