Make SlotImpl detachable from its owner, and add a new runOnAllThreads interface to Slot. #8135
source/common/thread_local/thread_local_impl.cc

@@ -27,8 +27,9 @@ SlotPtr InstanceImpl::allocateSlot() {
   if (free_slot_indexes_.empty()) {
     std::unique_ptr<SlotImpl> slot(new SlotImpl(*this, slots_.size()));
-    slots_.push_back(slot.get());
-    return slot;
+    auto wrapper = std::make_unique<Bookkeeper>(*this, std::move(slot));
+    slots_.push_back(&wrapper->slot());
+    return wrapper;
   }
   const uint32_t idx = free_slot_indexes_.front();
   free_slot_indexes_.pop_front();
@@ -42,11 +43,59 @@ bool InstanceImpl::SlotImpl::currentThreadRegistered() {
   return thread_local_data_.data_.size() > index_;
 }

+void InstanceImpl::SlotImpl::runOnAllThreads(const UpdateCb& cb) {
+  parent_.runOnAllThreads([this, cb]() { setThreadLocal(index_, cb(get())); });
+}
+
+void InstanceImpl::SlotImpl::runOnAllThreads(const UpdateCb& cb, Event::PostCb complete_cb) {
+  parent_.runOnAllThreads([this, cb]() { setThreadLocal(index_, cb(get())); }, complete_cb);
+}
+
 ThreadLocalObjectSharedPtr InstanceImpl::SlotImpl::get() {
   ASSERT(currentThreadRegistered());
   return thread_local_data_.data_[index_];
 }
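The two UpdateCb overloads above are the heart of the new Slot interface: the callback receives the thread's current object and returns its replacement, which is written back into the same slot index on that thread. A minimal standalone sketch of that read-modify-write contract (all names here are illustrative stand-ins, not Envoy's types):

#include <functional>
#include <iostream>
#include <memory>
#include <vector>

// Illustrative stand-ins for ThreadLocalObjectSharedPtr and Slot::UpdateCb.
struct Obj {
  int version;
};
using ObjSharedPtr = std::shared_ptr<Obj>;
using UpdateCb = std::function<ObjSharedPtr(ObjSharedPtr previous)>;

int main() {
  // One entry per worker thread, as if each were thread_local_data_.data_[index_].
  std::vector<ObjSharedPtr> per_thread_data{std::make_shared<Obj>(Obj{1}),
                                            std::make_shared<Obj>(Obj{1})};

  // The update callback builds the new per-thread object from the previous one.
  UpdateCb cb = [](ObjSharedPtr previous) {
    return std::make_shared<Obj>(Obj{previous->version + 1});
  };

  // InstanceImpl would post this to every worker dispatcher; here we apply it in sequence.
  for (auto& data : per_thread_data) {
    data = cb(data);
  }
  for (const auto& d : per_thread_data) {
    std::cout << d->version << "\n"; // prints 2 twice
  }
  return 0;
}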
+InstanceImpl::Bookkeeper::Bookkeeper(InstanceImpl& parent, std::unique_ptr<SlotImpl>&& slot)
+    : parent_(parent), holder_(std::make_unique<SlotHolder>(std::move(slot))) {}
+
+ThreadLocalObjectSharedPtr InstanceImpl::Bookkeeper::get() { return slot().get(); }
+
+void InstanceImpl::Bookkeeper::runOnAllThreads(const UpdateCb& cb, Event::PostCb complete_cb) {
+  slot().runOnAllThreads(
+      [cb, ref_count = holder_->ref_count_](ThreadLocalObjectSharedPtr previous) {
+        return cb(std::move(previous));
+      },
+      complete_cb);
+}
|
||
void InstanceImpl::Bookkeeper::runOnAllThreads(const UpdateCb& cb) { | ||
slot().runOnAllThreads( | ||
[cb, ref_count = holder_->ref_count_](ThreadLocalObjectSharedPtr previous) { | ||
return cb(std::move(previous)); | ||
}); | ||
} | ||
|
||
bool InstanceImpl::Bookkeeper::currentThreadRegistered() { | ||
return slot().currentThreadRegistered(); | ||
} | ||
|
||
void InstanceImpl::Bookkeeper::runOnAllThreads(Event::PostCb cb) { | ||
// Use holder_.ref_count_ to bookkeep how many on-the-fly callback are out there. | ||
slot().runOnAllThreads([cb, ref_count = holder_->ref_count_]() { cb(); }); | ||
} | ||
|
||
void InstanceImpl::Bookkeeper::runOnAllThreads(Event::PostCb cb, Event::PostCb main_callback) { | ||
// Use holder_.ref_count_ to bookkeep how many on-the-fly callback are out there. | ||
slot().runOnAllThreads([cb, main_callback, ref_count = holder_->ref_count_]() { cb(); }, | ||
main_callback); | ||
} | ||
|
||
void InstanceImpl::Bookkeeper::set(InitializeCb cb) { | ||
slot().set([cb, ref_count = holder_->ref_count_](Event::Dispatcher& dispatcher) | ||
-> ThreadLocalObjectSharedPtr { return cb(dispatcher); }); | ||
} | ||
|
||
void InstanceImpl::registerThread(Event::Dispatcher& dispatcher, bool main_thread) { | ||
ASSERT(std::this_thread::get_id() == main_thread_id_); | ||
ASSERT(!shutdown_); | ||
|
@@ -61,6 +110,40 @@ void InstanceImpl::registerThread(Event::Dispatcher& dispatcher, bool main_thread) {
   }
 }

+// Deletes a Slot if it is recyclable (no on-the-fly callbacks point to it); otherwise puts it
+// into a deferred-delete queue and schedules a cleanup task to collect the Slot once it becomes
+// recyclable.
+void InstanceImpl::recycle(std::unique_ptr<SlotHolder>&& holder) {
+  if (holder->isRecycleable()) {
+    holder.reset();
+    return;
+  }
+  deferred_deletes_.emplace_back(std::move(holder));
Nothing tracks when this holder is eventually recyclable, right? Meaning, in this case it will stick around in the deferred-delete list until something else gets scheduled for cleanup? (Since presumably this scheduled cleanup can race with the thing actually becoming recyclable.) Maybe this is fine? Or maybe you should be checking the use count on the worker threads and scheduling cleanup once it's detached and ready to be recycled (the last one)?

I think it's tracked: … So once an item is put into the queue, the event loop above (1 -> 2 -> 2 -> 2 -> …) guarantees the items are eventually cleaned up.

I think the canceling/waiting approach requires more changes to the code base. Basically, if we want to be lock-free, we'd have to split the LDS update (or server shutdown) into two phases: … In the meantime, if it's an LDS-update-caused ListenerImpl teardown, we have to bookkeep all the TLS resources owned by "this" ListenerImpl and wait only on them to be cleaned up, since an RDS subscription (or any other subscription) is shared between listeners. On the other hand, this change limits the changes to the TLS module only, with the least disturbance to the code base.

To be clear, this is how it already works. See …

I talked to @stevenzzzz offline and we are going to time-box trying to implement explicit shutdown of resources prior to listener shutdown. This might be accomplished via the init manager or some similar structure. If it ends up being too hard or has complications we can come back to this. /wait

I'd be in favor of not repeating something like the init manager. The init manager has turned out to be hard to reason about, with a maze of callbacks, watchers, registration conditions, lifetime considerations, and so on. History speaks for itself with all the bugs and races on this one. I think there is a good argument to be made that the solution in this PR has benefits in that it simplifies the contract around slot ownership. So, my $0.02 is that something like the solution in this PR is a nice direction; basically getting away from having threads worry about TLS slots directly.

Throwing out another idea that has come up when chatting with @stevenzzzz: we could have a centralized (owned by …) …

I think with this solution there are going to be potentially hard-to-reason-about thread interleavings, and I have no doubt we will have a different set of issues from this direction, though I agree it's a nice solution and it will work in the end. My reason for proposing the use of the existing init manager system is that it already exists. We must register things into the init manager, and I think it would be relatively trivial to also register a shutdown hook of some type at the same time. With that said, I don't feel that strongly about it, so if you both want to continue with this solution we can do that.

This is the way it already works?

Good discussion here. The other problem I want to call out is that no matter which way we go, we need to let contributors know about this problem: we should not distribute (by capturing some shared_ptr) ownership of a TLS slot or its owner to worker threads.

This is my point exactly. Although I agree the Init/ShutdownManager is problematic in certain ways, it's extremely explicit about what needs to happen: basically, don't do any more updates. We are trading that for rules around how this can be used which are hard to understand and enforce. With that, I will stop there. @htuch, I assume you want to move forward with this solution?
+  scheduleCleanup();
+}
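The recyclability test relies on shared_ptr use counting rather than an explicit integer: the holder keeps one reference to ref_count_, and every posted callback captures a copy. A standalone sketch of that idea (assuming, as in the PR, that the check runs on one thread after the callback copies have been destroyed; use_count() is only advisory across concurrently running threads):

#include <cassert>
#include <functional>
#include <memory>
#include <vector>

int main() {
  // Stand-in for SlotHolder::ref_count_: the stored value is unused, only the count matters.
  auto ref_count = std::make_shared<int>(0);
  std::vector<std::function<void()>> pending; // stand-in for per-worker dispatcher queues

  // Posting a callback copies the shared_ptr, bumping the use count.
  pending.push_back([ref_count]() { /* worker-side update would run here */ });
  assert(ref_count.use_count() == 2); // holder + one in-flight callback => not recyclable

  pending.clear(); // the callback ran (or was dropped); its capture is destroyed
  assert(ref_count.use_count() == 1); // only the holder remains => recyclable
  return 0;
}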
+// Cleans up the deferred-deletes queue: removes all the items with ref count 1, and reschedules
+// another cleanup task if unrecyclable items remain. This ensures that, unless the server shuts
+// down, an enqueued item is eventually deleted.
+void InstanceImpl::scheduleCleanup() {
+  ASSERT(main_thread_dispatcher_ != nullptr);
+  if (shutdown_) {
+    return;
+  }
+  if (deferred_deletes_.empty()) {
+    return;
+  }
+  main_thread_dispatcher_->post([this]() {
I feel like this ought to be avoidable, but perhaps I'm missing something. We're using ref_count_ not as an integer, but just to have an object with a use count, right? If we made ref_count_ a class, had Bookkeeper hold a reference to it (along with the runOnAllThreads calls), and the refcount destructor did a dispatcher post(delete), couldn't we avoid this extra layer of cleanup complexity? We'd still delete when both the Bookkeeper and all thread references were gone, and we'd still delete on the original thread.

So the problem with "posting back to the main thread when the propagation is done" is that there again is a race window, where the posted callback on the main thread may get run first, and then the copy of the lambda in the worker thread may hold the last reference to ref_count_. IMHO, we should also avoid moving the ownership of a SlotImpl or its owner by posting a callback/complete_cb which shares a portion of the object. That may cause complications when ServerImpl tears down; I ran into a crash with that, where I posted a shared_ptr back to the main-thread dispatcher queue, and during shutdown the SingletonManager is torn down ahead of the DispatcherImpl in ServerImpl, which includes the RouteConfigProviderManager that's referenced by the RdsRouteConfigProvider in the dispatcher message queue.

Yeah, SlotHolder is basically the object that holds a slot and its outgoing ref count. I made it a class instead of using a bare shared pointer as I thought it may be more "readable" …

Oh, SingletonManager early destruction. With this restriction I don't have a good opinion...

Talked w/ Alyssa offline, got the idea now. I believe this matches what @lambdai suggested as well.
+    deferred_deletes_.remove_if(
+        [](std::unique_ptr<SlotHolder>& holder) -> bool { return holder->isRecycleable(); });
+    if (!deferred_deletes_.empty()) {
+      // Reschedule another cleanup task if there are still non-recyclable slots.
+      scheduleCleanup();
+    }
+  });
+}

Prefer a loop vs. recursion.

Let me update the misleading comment.

Thanks, this makes more sense now. Can you add a bunch more comments inside …

Ping on adding more comments.

Oops, missed this.
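For contrast, the alternative discussed above (making the refcount a class whose destructor posts the deletion back to the main dispatcher) can be approximated with a custom shared_ptr deleter. This is a hypothetical sketch of that idea, not the PR's design; FakeDispatcher and Slot are stand-ins, and, as noted in the thread, the posted deletion can still race with shutdown-time teardown order (e.g. the SingletonManager being destroyed before the dispatcher):

#include <functional>
#include <iostream>
#include <memory>
#include <queue>

// Single-threaded stand-in for the main-thread dispatcher's post queue.
struct FakeDispatcher {
  std::queue<std::function<void()>> queue_;
  void post(std::function<void()> cb) { queue_.push(std::move(cb)); }
  void run() {
    while (!queue_.empty()) {
      queue_.front()();
      queue_.pop();
    }
  }
};

struct Slot {
  ~Slot() { std::cout << "slot deleted on main thread\n"; }
};

int main() {
  FakeDispatcher main_dispatcher;
  // Custom deleter: whichever copy dies last posts the actual deletion.
  std::shared_ptr<Slot> slot(new Slot, [&main_dispatcher](Slot* s) {
    main_dispatcher.post([s]() { delete s; });
  });
  auto in_flight = slot; // copy captured by a worker callback
  slot.reset();          // Bookkeeper destroyed; slot still alive
  in_flight.reset();     // last reference gone; deleter posts the deletion
  main_dispatcher.run(); // slot actually deleted here
  return 0;
}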
 void InstanceImpl::removeSlot(SlotImpl& slot) {
   ASSERT(std::this_thread::get_id() == main_thread_id_);
source/common/thread_local/thread_local_impl.h
@@ -8,14 +8,15 @@
 #include "envoy/thread_local/thread_local.h"

 #include "common/common/logger.h"
+#include "common/common/non_copyable.h"

 namespace Envoy {
 namespace ThreadLocal {

 /**
  * Implementation of ThreadLocal that relies on static thread_local objects.
  */
-class InstanceImpl : Logger::Loggable<Logger::Id::main>, public Instance {
+class InstanceImpl : Logger::Loggable<Logger::Id::main>, public NonCopyable, public Instance {
 public:
   InstanceImpl() : main_thread_id_(std::this_thread::get_id()) {}
   ~InstanceImpl() override;
@@ -35,6 +36,8 @@ class InstanceImpl : Logger::Loggable<Logger::Id::main>, public Instance {
     // ThreadLocal::Slot
     ThreadLocalObjectSharedPtr get() override;
     bool currentThreadRegistered() override;
+    void runOnAllThreads(const UpdateCb& cb) override;
+    void runOnAllThreads(const UpdateCb& cb, Event::PostCb complete_cb) override;
     void runOnAllThreads(Event::PostCb cb) override { parent_.runOnAllThreads(cb); }
     void runOnAllThreads(Event::PostCb cb, Event::PostCb main_callback) override {
       parent_.runOnAllThreads(cb, main_callback);
@@ -45,17 +48,55 @@ class InstanceImpl : Logger::Loggable<Logger::Id::main>, public Instance {
     const uint64_t index_;
   };
+  // A helper class for holding a SlotImpl and its bookkeeping shared_ptr, which counts the
+  // number of update callbacks on the fly.
+  struct SlotHolder {
+    SlotHolder(std::unique_ptr<SlotImpl>&& slot) : slot_(std::move(slot)) {}
+    bool isRecycleable() { return ref_count_.use_count() == 1; }
+
+    const std::unique_ptr<SlotImpl> slot_;
+    const std::shared_ptr<int> ref_count_{new int(0)};
+  };

The shared_ptr as ref_count reminds me of the pattern I eliminated in Envoy before. See #7011.
||
// A Wrapper of SlotImpl which on destruction returns the SlotImpl to the deferred delete queue | ||
// (detaches it). | ||
struct Bookkeeper : public Slot { | ||
Bookkeeper(InstanceImpl& parent, std::unique_ptr<SlotImpl>&& slot); | ||
~Bookkeeper() override { parent_.recycle(std::move(holder_)); } | ||
SlotImpl& slot() { return *(holder_->slot_); } | ||
|
||
// ThreadLocal::Slot | ||
ThreadLocalObjectSharedPtr get() override; | ||
void runOnAllThreads(const UpdateCb& cb) override; | ||
void runOnAllThreads(const UpdateCb& cb, Event::PostCb complete_cb) override; | ||
bool currentThreadRegistered() override; | ||
void runOnAllThreads(Event::PostCb cb) override; | ||
void runOnAllThreads(Event::PostCb cb, Event::PostCb main_callback) override; | ||
void set(InitializeCb cb) override; | ||
|
||
InstanceImpl& parent_; | ||
std::unique_ptr<SlotHolder> holder_; | ||
}; | ||
|
||
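The Bookkeeper is essentially a handle whose destructor hands the resource back to its owner instead of deleting it, so a consumer can drop its SlotPtr at any time without worrying about in-flight callbacks. A minimal standalone analogue of that ownership pattern (Owner, Handle, and Resource are illustrative names, not Envoy's):

#include <iostream>
#include <list>
#include <memory>

struct Resource {
  ~Resource() { std::cout << "resource freed\n"; }
};

// Stand-in for InstanceImpl: decides between immediate and deferred deletion.
struct Owner {
  std::list<std::unique_ptr<Resource>> deferred_;
  void recycle(std::unique_ptr<Resource>&& r, bool busy) {
    if (!busy) {
      r.reset(); // recyclable: free immediately
      return;
    }
    deferred_.push_back(std::move(r)); // otherwise park it for a later cleanup pass
  }
};

// Stand-in for Bookkeeper: what allocateSlot() would hand to the caller.
struct Handle {
  Owner& owner_;
  std::unique_ptr<Resource> resource_{new Resource};
  bool busy_{false}; // stand-in for "callbacks still in flight"
  ~Handle() { owner_.recycle(std::move(resource_), busy_); }
};

int main() {
  Owner owner;
  {
    Handle h{owner};
    h.busy_ = true; // pretend a posted callback still references the resource
  } // Handle destroyed; resource is parked, not freed
  std::cout << "deferred size: " << owner.deferred_.size() << "\n"; // prints 1
  owner.deferred_.clear(); // a later cleanup task frees it
  return 0;
}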
   struct ThreadLocalData {
     Event::Dispatcher* dispatcher_{};
     std::vector<ThreadLocalObjectSharedPtr> data_;
   };

+  void recycle(std::unique_ptr<SlotHolder>&& holder);
+  // Cleans up the deferred-deletes queue.
+  void scheduleCleanup();
   void removeSlot(SlotImpl& slot);
   void runOnAllThreads(Event::PostCb cb);
   void runOnAllThreads(Event::PostCb cb, Event::PostCb main_callback);
   static void setThreadLocal(uint32_t index, ThreadLocalObjectSharedPtr object);

   static thread_local ThreadLocalData thread_local_data_;

+  // A queue of Slots whose deletion has to be deferred because outgoing callbacks still point
+  // to them.
+  std::list<std::unique_ptr<SlotHolder>> deferred_deletes_;
+
   std::vector<SlotImpl*> slots_;
   // A list of indexes of freed slots.
   std::list<uint32_t> free_slot_indexes_;
Can we remove tls_ as a member?

I am not sure I follow your question; may I ask why? tls_ is per subscription in the config-provider framework. Depending on the config type (delta/non-delta), it creates a per-worker or cross-worker-shared ConfigImpl, which is shared among providers.