-
Notifications
You must be signed in to change notification settings - Fork 6
SYSTEM PRESHUTDOWN command for graceful shutdown swarm node #852
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: antalya-25.3
Are you sure you want to change the base?
Changes from all commits
9642c42
42c201c
49cf8bf
b75ce28
8310a92
5c3c509
27ff656
9a66f65
40f4765
5476f61
86cac4f
00c3954
d3df0ad
a7254d2
e4c48fa
d19ead0
b998184
e820d88
5105455
14d42eb
870025d
cb0bf72
eed67f8
5f00c10
650b0f0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -185,6 +185,7 @@ namespace CurrentMetrics | |
extern const Metric IcebergCatalogThreads; | ||
extern const Metric IcebergCatalogThreadsActive; | ||
extern const Metric IcebergCatalogThreadsScheduled; | ||
extern const Metric IsSwarmModeEnabled; | ||
} | ||
|
||
|
||
|
@@ -566,6 +567,7 @@ struct ContextSharedPart : boost::noncopyable | |
std::map<String, UInt16> server_ports; | ||
|
||
std::atomic<bool> shutdown_called = false; | ||
std::atomic<bool> swarm_mode_enabled = true; | ||
|
||
Stopwatch uptime_watch TSA_GUARDED_BY(mutex); | ||
|
||
|
@@ -734,6 +736,7 @@ struct ContextSharedPart : boost::noncopyable | |
*/ | ||
void shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS | ||
{ | ||
swarm_mode_enabled = false; | ||
bool is_shutdown_called = shutdown_called.exchange(true); | ||
if (is_shutdown_called) | ||
return; | ||
|
@@ -4481,7 +4484,6 @@ std::shared_ptr<Cluster> Context::getCluster(const std::string & cluster_name) c | |
throw Exception(ErrorCodes::CLUSTER_DOESNT_EXIST, "Requested cluster '{}' not found", cluster_name); | ||
} | ||
|
||
|
||
std::shared_ptr<Cluster> Context::tryGetCluster(const std::string & cluster_name) const | ||
{ | ||
std::shared_ptr<Cluster> res = nullptr; | ||
|
@@ -4500,6 +4502,21 @@ std::shared_ptr<Cluster> Context::tryGetCluster(const std::string & cluster_name | |
return res; | ||
} | ||
|
||
/// Calls unregisterAll() on shared->cluster_discovery when it is set; does nothing otherwise. | ||
/// shared->clusters_mutex is held for the whole call. | ||
void Context::unregisterInDynamicClusters() | ||
{ | ||
    std::lock_guard clusters_guard(shared->clusters_mutex); | ||
    if (shared->cluster_discovery) | ||
        shared->cluster_discovery->unregisterAll(); | ||
} | ||
|
||
/// Calls registerAll() on shared->cluster_discovery when it is set; does nothing otherwise. | ||
/// shared->clusters_mutex is held for the whole call. | ||
void Context::registerInDynamicClusters() | ||
{ | ||
    std::lock_guard clusters_guard(shared->clusters_mutex); | ||
    if (shared->cluster_discovery) | ||
        shared->cluster_discovery->registerAll(); | ||
} | ||
|
||
void Context::reloadClusterConfig() const | ||
{ | ||
|
@@ -5350,12 +5367,27 @@ void Context::stopServers(const ServerType & server_type) const | |
shared->stop_servers_callback(server_type); | ||
} | ||
|
||
|
||
/// Forwards to shutdown() of the shared state object (`shared`). | ||
/// The function is annotated with TSA_NO_THREAD_SAFETY_ANALYSIS. | ||
void Context::shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS | ||
{ | ||
shared->shutdown(); | ||
} | ||
|
||
/// Turns swarm mode off: first sets the IsSwarmModeEnabled metric to 0, then clears shared->swarm_mode_enabled. | ||
/// NOTE(review): the metric update and the flag update are two separate writes, not one atomic step, | ||
/// so an interleaved startSwarmMode() could leave the metric and the flag disagreeing - confirm whether | ||
/// callers serialize STOP/START SWARM MODE. | ||
void Context::stopSwarmMode() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The individual operations of this function are atomic, but the function itself is not. Can't this cause problems? For instance, consider two STOP/START queries running in different threads: thread1 -> stop swarm mode thread2 -> start swarm mode thread 1 -> resume The resulting state is: swarm mode boolean = true, metrics = false. |
||
{ | ||
CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 0); | ||
shared->swarm_mode_enabled = false; | ||
} | ||
|
||
/// Turns swarm mode on: first sets shared->swarm_mode_enabled, then sets the IsSwarmModeEnabled metric to 1. | ||
/// NOTE(review): the flag update and the metric update are two separate writes, not one atomic step, | ||
/// so an interleaved stopSwarmMode() could leave the flag and the metric disagreeing - confirm whether | ||
/// callers serialize STOP/START SWARM MODE. | ||
void Context::startSwarmMode() | ||
{ | ||
shared->swarm_mode_enabled = true; | ||
CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 1); | ||
} | ||
|
||
/// Returns the current value of the shared swarm_mode_enabled flag. | ||
bool Context::isSwarmModeEnabled() const | ||
{ | ||
    return shared->swarm_mode_enabled.load(); | ||
} | ||
|
||
Context::ApplicationType Context::getApplicationType() const | ||
{ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -693,6 +693,20 @@ BlockIO InterpreterSystemQuery::execute() | |
case Type::START_MOVES: | ||
startStopAction(ActionLocks::PartsMove, true); | ||
break; | ||
case Type::STOP_SWARM_MODE: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Perhaps this should also be a single atomic operation? |
||
{ | ||
getContext()->checkAccess(AccessType::SYSTEM_SWARM); | ||
getContext()->stopSwarmMode(); | ||
getContext()->unregisterInDynamicClusters(); | ||
break; | ||
} | ||
case Type::START_SWARM_MODE: | ||
{ | ||
getContext()->checkAccess(AccessType::SYSTEM_SWARM); | ||
getContext()->registerInDynamicClusters(); | ||
getContext()->startSwarmMode(); | ||
break; | ||
} | ||
case Type::STOP_FETCHES: | ||
startStopAction(ActionLocks::PartsFetch, false); | ||
break; | ||
|
@@ -1564,6 +1578,12 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() | |
required_access.emplace_back(AccessType::SYSTEM_MOVES, query.getDatabase(), query.getTable()); | ||
break; | ||
} | ||
case Type::STOP_SWARM_MODE: | ||
case Type::START_SWARM_MODE: | ||
{ | ||
required_access.emplace_back(AccessType::SYSTEM_SWARM); | ||
break; | ||
} | ||
case Type::STOP_PULLING_REPLICATION_LOG: | ||
case Type::START_PULLING_REPLICATION_LOG: | ||
{ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ namespace ErrorCodes | |
extern const int CANNOT_READ_FROM_SOCKET; | ||
extern const int CANNOT_OPEN_FILE; | ||
extern const int SOCKET_TIMEOUT; | ||
extern const int ATTEMPT_TO_READ_AFTER_EOF; | ||
} | ||
|
||
RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext( | ||
|
@@ -56,16 +57,32 @@ void RemoteQueryExecutorReadContext::Task::run(AsyncCallback async_callback, Sus | |
|
||
while (true) | ||
{ | ||
read_context.has_read_packet_part = PacketPart::None; | ||
|
||
if (read_context.read_packet_type_separately) | ||
try | ||
{ | ||
read_context.has_read_packet_part = PacketPart::None; | ||
|
||
if (read_context.read_packet_type_separately) | ||
{ | ||
read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback); | ||
read_context.has_read_packet_part = PacketPart::Type; | ||
suspend_callback(); | ||
} | ||
read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback); | ||
read_context.has_read_packet_part = PacketPart::Body; | ||
if (read_context.packet.type == Protocol::Server::Data) | ||
read_context.has_data_packets = true; | ||
} | ||
catch (const Exception & e) | ||
{ | ||
read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback); | ||
read_context.has_read_packet_part = PacketPart::Type; | ||
suspend_callback(); | ||
if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you please add a comment explaining what this condition handles? The same for the if statement below. |
||
&& !read_context.has_data_packets.load() && read_context.executor.skipUnavailableShards()) | ||
{ | ||
read_context.has_read_packet_part = PacketPart::None; | ||
} | ||
else | ||
throw; | ||
} | ||
read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback); | ||
read_context.has_read_packet_part = PacketPart::Body; | ||
|
||
suspend_callback(); | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry for my ignorance.
What does a "dynamic" cluster actually mean?