Skip to content

SYSTEM PRESHUTDOWN command for graceful shutdown swarm node #852

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 25 commits into
base: antalya-25.3
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9642c42
SYSTEM PRESHUTDOWN to allow graceful shutdown node
ianton-ru Jun 6, 2025
42c201c
Fix tests
ianton-ru Jun 12, 2025
49cf8bf
I hate coroutines
ianton-ru Jun 13, 2025
b75ce28
Merge branch 'antalya-25.3' into feature/system_preshutdown
ianton-ru Jun 13, 2025
8310a92
Change PRESUTDOWN on STOP SWARM command
ianton-ru Jun 25, 2025
5c3c509
Using Altinity's branding instead of upstream's
Enmk Jun 25, 2025
27ff656
Updated packages metadata
Enmk Jun 25, 2025
9a66f65
Minor fixups
Enmk Jun 26, 2025
40f4765
Fixed maintainer information in packages
Enmk Jun 26, 2025
5476f61
Compact favicon
Enmk Jun 26, 2025
86cac4f
Altinity branding and colors for play.html
Enmk Jun 26, 2025
00c3954
Minor: Typo fix + other small changes
Enmk Jun 26, 2025
d3df0ad
Update github-repo
Enmk Jun 26, 2025
a7254d2
Update binary.html
Enmk Jun 26, 2025
e4c48fa
lock_object_storage_task_distribution_ms setting
ianton-ru Jun 12, 2025
d19ead0
Remove timeouts
ianton-ru Jun 23, 2025
b998184
Faster test
ianton-ru Jun 24, 2025
e820d88
Moved changes to 25.3 section
Enmk Jul 1, 2025
5105455
Fix after review
ianton-ru Jul 2, 2025
14d42eb
Allow data and metadata by different paths
ianton-ru Jun 12, 2025
870025d
Cut bucket from path
ianton-ru Jun 16, 2025
cb0bf72
Iceberg catalog with S3 tables
ianton-ru Jun 19, 2025
eed67f8
Dirty workaround to resolve correct endpoint in HEAD requests
ianton-ru Jun 20, 2025
5f00c10
IsSwarmModeEnabled metric
ianton-ru Jul 3, 2025
650b0f0
Merge branch 'antalya-25.3' into feature/system_preshutdown
ianton-ru Jul 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/en/sql-reference/statements/system.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,12 @@ SYSTEM RELOAD USERS [ON CLUSTER cluster_name]

Normally shuts down ClickHouse (like `service clickhouse-server stop` / `kill {$pid_clickhouse-server}`)

## PRESHUTDOWN {#preshutdown}

<CloudNotSupportedBadge/>

Prepares the node for a graceful shutdown: unregisters it from autodiscovered clusters and stops accepting distributed requests to object storages (`s3Cluster`, `icebergCluster`, etc.).

## KILL {#kill}

Aborts ClickHouse process (like `kill -9 {$ pid_clickhouse-server}`)
Expand Down
4 changes: 4 additions & 0 deletions programs/server/Server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2261,6 +2261,8 @@ try

}

global_context->startSwarmMode();

{
std::lock_guard lock(servers_lock);
/// We should start interserver communications before (and more important shutdown after) tables.
Expand Down Expand Up @@ -2689,6 +2691,8 @@ try

is_cancelled = true;

global_context->stopSwarmMode();

LOG_DEBUG(log, "Waiting for current connections to close.");

size_t current_connections = 0;
Expand Down
1 change: 1 addition & 0 deletions src/Access/Common/AccessType.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,7 @@ enum class AccessType : uint8_t
M(SYSTEM_TTL_MERGES, "SYSTEM STOP TTL MERGES, SYSTEM START TTL MERGES, STOP TTL MERGES, START TTL MERGES", TABLE, SYSTEM) \
M(SYSTEM_FETCHES, "SYSTEM STOP FETCHES, SYSTEM START FETCHES, STOP FETCHES, START FETCHES", TABLE, SYSTEM) \
M(SYSTEM_MOVES, "SYSTEM STOP MOVES, SYSTEM START MOVES, STOP MOVES, START MOVES", TABLE, SYSTEM) \
M(SYSTEM_SWARM, "SYSTEM STOP SWARM MODE, SYSTEM START SWARM MODE, STOP SWARM MODE, START SWARM MODE", GLOBAL, SYSTEM) \
M(SYSTEM_PULLING_REPLICATION_LOG, "SYSTEM STOP PULLING REPLICATION LOG, SYSTEM START PULLING REPLICATION LOG", TABLE, SYSTEM) \
M(SYSTEM_CLEANUP, "SYSTEM STOP CLEANUP, SYSTEM START CLEANUP", TABLE, SYSTEM) \
M(SYSTEM_VIEWS, "SYSTEM REFRESH VIEW, SYSTEM START VIEWS, SYSTEM STOP VIEWS, SYSTEM START VIEW, SYSTEM STOP VIEW, SYSTEM CANCEL VIEW, REFRESH VIEW, START VIEWS, STOP VIEWS, START VIEW, STOP VIEW, CANCEL VIEW", VIEW, SYSTEM) \
Expand Down
1 change: 1 addition & 0 deletions src/Common/CurrentMetrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@
M(StartupScriptsExecutionState, "State of startup scripts execution: 0 = not finished, 1 = success, 2 = failure.") \
\
M(IsServerShuttingDown, "Indicates if the server is shutting down: 0 = no, 1 = yes") \
M(IsSwarmModeEnabled, "Indicates if the swarm mode enabled or not: 0 = disabled, 1 = enabled") \

#ifdef APPLY_FOR_EXTERNAL_METRICS
#define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M)
Expand Down
40 changes: 39 additions & 1 deletion src/Interpreters/ClusterDiscovery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,9 @@ bool ClusterDiscovery::upsertCluster(ClusterInfo & cluster_info)
return true;
};

if (!cluster_info.current_node_is_observer && !contains(node_uuids, current_node_name))
if (!cluster_info.current_node_is_observer
&& context->isSwarmModeEnabled()
&& !contains(node_uuids, current_node_name))
{
LOG_ERROR(log, "Can't find current node in cluster '{}', will register again", cluster_info.name);
registerInZk(zk, cluster_info);
Expand Down Expand Up @@ -454,12 +456,30 @@ void ClusterDiscovery::registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & inf
return;
}

if (!context->isSwarmModeEnabled())
{
LOG_DEBUG(log, "STOP SWARM MODE called, skip self-registering current node {} in cluster {}", current_node_name, info.name);
return;
}

LOG_DEBUG(log, "Registering current node {} in cluster {}", current_node_name, info.name);

zk->createOrUpdate(node_path, info.current_node.serialize(), zkutil::CreateMode::Ephemeral);
LOG_DEBUG(log, "Current node {} registered in cluster {}", current_node_name, info.name);
}

/// Removes the node of the current server from the shard list of the cluster described by `info`.
/// Does nothing when the current server is only an observer of that cluster.
///
/// The node is created as ephemeral by registerInZk(), and registration is skipped while
/// swarm mode is disabled, so the node may legitimately be absent here (for example on a
/// repeated STOP SWARM MODE). An absent node is therefore treated as "already unregistered"
/// instead of an error; otherwise the exception would also interrupt unregistering from the
/// remaining clusters. Any other unsuccessful result is still reported as an exception.
void ClusterDiscovery::unregisterFromZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info)
{
    if (info.current_node_is_observer)
        return;

    String node_path = getShardsListPath(info.zk_root) / current_node_name;
    LOG_DEBUG(log, "Removing current node {} from cluster {}", current_node_name, info.name);

    /// tryRemove reports a missing node through the return code instead of throwing.
    const auto code = zk->tryRemove(node_path);
    if (code == Coordination::Error::ZNONODE)
    {
        LOG_DEBUG(log, "Current node {} is not registered in cluster {}, nothing to remove", current_node_name, info.name);
        return;
    }
    if (code != Coordination::Error::ZOK)
        throw zkutil::KeeperException::fromPath(code, node_path);

    LOG_DEBUG(log, "Current node {} removed from cluster {}", current_node_name, info.name);
}

void ClusterDiscovery::initialUpdate()
{
LOG_DEBUG(log, "Initializing");
Expand Down Expand Up @@ -505,6 +525,24 @@ void ClusterDiscovery::initialUpdate()
is_initialized = true;
}

/// Registers the current node in every cluster known to this discovery instance,
/// using the ZooKeeper (default or auxiliary) configured for each cluster.
void ClusterDiscovery::registerAll()
{
    for (auto & name_and_info : clusters_info)
    {
        auto & cluster = name_and_info.second;
        auto keeper = context->getDefaultOrAuxiliaryZooKeeper(cluster.zk_name);
        registerInZk(keeper, cluster);
    }
}

/// Removes the current node from every cluster known to this discovery instance,
/// using the ZooKeeper (default or auxiliary) configured for each cluster.
void ClusterDiscovery::unregisterAll()
{
    for (auto & name_and_info : clusters_info)
    {
        auto & cluster = name_and_info.second;
        auto keeper = context->getDefaultOrAuxiliaryZooKeeper(cluster.zk_name);
        unregisterFromZk(keeper, cluster);
    }
}

void ClusterDiscovery::findDynamicClusters(
std::unordered_map<String, ClusterDiscovery::ClusterInfo> & info,
std::unordered_set<size_t> * unchanged_roots)
Expand Down
4 changes: 4 additions & 0 deletions src/Interpreters/ClusterDiscovery.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ class ClusterDiscovery

~ClusterDiscovery();

void registerAll();
void unregisterAll();

private:
struct NodeInfo
{
Expand Down Expand Up @@ -125,6 +128,7 @@ class ClusterDiscovery
void initialUpdate();

void registerInZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info);
void unregisterFromZk(zkutil::ZooKeeperPtr & zk, ClusterInfo & info);

Strings getNodeNames(zkutil::ZooKeeperPtr & zk,
const String & zk_root,
Expand Down
36 changes: 34 additions & 2 deletions src/Interpreters/Context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ namespace CurrentMetrics
extern const Metric IcebergCatalogThreads;
extern const Metric IcebergCatalogThreadsActive;
extern const Metric IcebergCatalogThreadsScheduled;
extern const Metric IsSwarmModeEnabled;
}


Expand Down Expand Up @@ -566,6 +567,7 @@ struct ContextSharedPart : boost::noncopyable
std::map<String, UInt16> server_ports;

std::atomic<bool> shutdown_called = false;
std::atomic<bool> swarm_mode_enabled = true;

Stopwatch uptime_watch TSA_GUARDED_BY(mutex);

Expand Down Expand Up @@ -734,6 +736,7 @@ struct ContextSharedPart : boost::noncopyable
*/
void shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS
{
swarm_mode_enabled = false;
bool is_shutdown_called = shutdown_called.exchange(true);
if (is_shutdown_called)
return;
Expand Down Expand Up @@ -4481,7 +4484,6 @@ std::shared_ptr<Cluster> Context::getCluster(const std::string & cluster_name) c
throw Exception(ErrorCodes::CLUSTER_DOESNT_EXIST, "Requested cluster '{}' not found", cluster_name);
}


std::shared_ptr<Cluster> Context::tryGetCluster(const std::string & cluster_name) const
{
std::shared_ptr<Cluster> res = nullptr;
Expand All @@ -4500,6 +4502,21 @@ std::shared_ptr<Cluster> Context::tryGetCluster(const std::string & cluster_name
return res;
}

void Context::unregisterInDynamicClusters()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for my ignorance.

What does "dynamic" cluster actually mean?

{
std::lock_guard lock(shared->clusters_mutex);
if (!shared->cluster_discovery)
return;
shared->cluster_discovery->unregisterAll();
}

/// Asks cluster discovery, if it has been created, to register the current node in all of its clusters.
/// The call is made while holding `clusters_mutex`.
void Context::registerInDynamicClusters()
{
    std::lock_guard lock(shared->clusters_mutex);
    if (shared->cluster_discovery)
        shared->cluster_discovery->registerAll();
}

void Context::reloadClusterConfig() const
{
Expand Down Expand Up @@ -5350,12 +5367,27 @@ void Context::stopServers(const ServerType & server_type) const
shared->stop_servers_callback(server_type);
}


/// Shuts the context down by delegating to `shared->shutdown()`.
void Context::shutdown() TSA_NO_THREAD_SAFETY_ANALYSIS
{
shared->shutdown();
}

void Context::stopSwarmMode()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The individual operations of this function are atomic, but the function itself is not. Can't this cause problems?

For instance, consider two STOP/START queries running in different threads:

thread1 -> stop swarm mode
metrics updated, it is now false
context switch

thread2 -> start swarm mode
metrics updated, it is now true
swarm mode enabled

thread 1 -> resume
swarm mode disabled

The resulting state is: swarm mode boolean = true, metrics = false.

{
CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 0);
shared->swarm_mode_enabled = false;
}

/// Turns swarm mode on: raises the shared flag first, then publishes the new state
/// through the IsSwarmModeEnabled metric.
/// NOTE(review): the flag and the metric are updated as two separate steps, so a concurrent
/// stopSwarmMode() may leave them disagreeing — confirm whether callers serialize these calls.
void Context::startSwarmMode()
{
    shared->swarm_mode_enabled.store(true);
    CurrentMetrics::set(CurrentMetrics::IsSwarmModeEnabled, 1);
}

/// Returns the current value of the shared swarm-mode flag.
bool Context::isSwarmModeEnabled() const
{
    return shared->swarm_mode_enabled.load();
}

Context::ApplicationType Context::getApplicationType() const
{
Expand Down
7 changes: 7 additions & 0 deletions src/Interpreters/Context.h
Original file line number Diff line number Diff line change
Expand Up @@ -1225,6 +1225,8 @@ class Context: public ContextData, public std::enable_shared_from_this<Context>
size_t getClustersVersion() const;

void startClusterDiscovery();
void registerInDynamicClusters();
void unregisterInDynamicClusters();

/// Sets custom cluster, but doesn't update configuration
void setCluster(const String & cluster_name, const std::shared_ptr<Cluster> & cluster);
Expand Down Expand Up @@ -1335,6 +1337,11 @@ class Context: public ContextData, public std::enable_shared_from_this<Context>

void shutdown();

/// Stop some of the node's work to allow a graceful shutdown later
void stopSwarmMode();
void startSwarmMode();
bool isSwarmModeEnabled() const;

bool isInternalQuery() const { return is_internal_query; }
void setInternalQuery(bool internal) { is_internal_query = internal; }

Expand Down
20 changes: 20 additions & 0 deletions src/Interpreters/InterpreterSystemQuery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -693,6 +693,20 @@ BlockIO InterpreterSystemQuery::execute()
case Type::START_MOVES:
startStopAction(ActionLocks::PartsMove, true);
break;
case Type::STOP_SWARM_MODE:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this should also be a single atomic operation?

{
getContext()->checkAccess(AccessType::SYSTEM_SWARM);
getContext()->stopSwarmMode();
getContext()->unregisterInDynamicClusters();
break;
}
case Type::START_SWARM_MODE:
{
getContext()->checkAccess(AccessType::SYSTEM_SWARM);
getContext()->registerInDynamicClusters();
getContext()->startSwarmMode();
break;
}
case Type::STOP_FETCHES:
startStopAction(ActionLocks::PartsFetch, false);
break;
Expand Down Expand Up @@ -1564,6 +1578,12 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
required_access.emplace_back(AccessType::SYSTEM_MOVES, query.getDatabase(), query.getTable());
break;
}
case Type::STOP_SWARM_MODE:
case Type::START_SWARM_MODE:
{
required_access.emplace_back(AccessType::SYSTEM_SWARM);
break;
}
case Type::STOP_PULLING_REPLICATION_LOG:
case Type::START_PULLING_REPLICATION_LOG:
{
Expand Down
2 changes: 2 additions & 0 deletions src/Parsers/ASTSystemQuery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,8 @@ void ASTSystemQuery::formatImpl(WriteBuffer & ostr, const FormatSettings & setti
case Type::DROP_PAGE_CACHE:
case Type::STOP_REPLICATED_DDL_QUERIES:
case Type::START_REPLICATED_DDL_QUERIES:
case Type::STOP_SWARM_MODE:
case Type::START_SWARM_MODE:
break;
case Type::UNKNOWN:
case Type::END:
Expand Down
2 changes: 2 additions & 0 deletions src/Parsers/ASTSystemQuery.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class ASTSystemQuery : public IAST, public ASTQueryWithOnCluster
START_FETCHES,
STOP_MOVES,
START_MOVES,
STOP_SWARM_MODE,
START_SWARM_MODE,
STOP_REPLICATED_SENDS,
START_REPLICATED_SENDS,
STOP_REPLICATION_QUEUES,
Expand Down
5 changes: 5 additions & 0 deletions src/QueryPipeline/RemoteQueryExecutor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,11 @@ void RemoteQueryExecutor::setProfileInfoCallback(ProfileInfoCallback callback)
profile_info_callback = std::move(callback);
}

/// Returns the value of the `skip_unavailable_shards` setting taken from the executor's context.
bool RemoteQueryExecutor::skipUnavailableShards() const
{
    const auto & settings = context->getSettingsRef();
    return settings[Setting::skip_unavailable_shards];
}

bool RemoteQueryExecutor::needToSkipUnavailableShard() const
{
return context->getSettingsRef()[Setting::skip_unavailable_shards] && (0 == connections->size());
Expand Down
2 changes: 2 additions & 0 deletions src/QueryPipeline/RemoteQueryExecutor.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,8 @@ class RemoteQueryExecutor

IConnections & getConnections() { return *connections; }

bool skipUnavailableShards() const;

bool needToSkipUnavailableShard() const;

bool isReplicaUnavailable() const { return extension && extension->parallel_reading_coordinator && connections->size() == 0; }
Expand Down
33 changes: 25 additions & 8 deletions src/QueryPipeline/RemoteQueryExecutorReadContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ namespace ErrorCodes
extern const int CANNOT_READ_FROM_SOCKET;
extern const int CANNOT_OPEN_FILE;
extern const int SOCKET_TIMEOUT;
extern const int ATTEMPT_TO_READ_AFTER_EOF;
}

RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext(
Expand Down Expand Up @@ -56,16 +57,32 @@ void RemoteQueryExecutorReadContext::Task::run(AsyncCallback async_callback, Sus

while (true)
{
read_context.has_read_packet_part = PacketPart::None;

if (read_context.read_packet_type_separately)
try
{
read_context.has_read_packet_part = PacketPart::None;

if (read_context.read_packet_type_separately)
{
read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback);
read_context.has_read_packet_part = PacketPart::Type;
suspend_callback();
}
read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback);
read_context.has_read_packet_part = PacketPart::Body;
if (read_context.packet.type == Protocol::Server::Data)
read_context.has_data_packets = true;
}
catch (const Exception & e)
{
read_context.packet.type = read_context.executor.getConnections().receivePacketTypeUnlocked(async_callback);
read_context.has_read_packet_part = PacketPart::Type;
suspend_callback();
if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please add a comment explaining what ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF usually means and why you are catching it?

The same for the below if statement

&& !read_context.has_data_packets.load() && read_context.executor.skipUnavailableShards())
{
read_context.has_read_packet_part = PacketPart::None;
}
else
throw;
}
read_context.packet = read_context.executor.getConnections().receivePacketUnlocked(async_callback);
read_context.has_read_packet_part = PacketPart::Body;

suspend_callback();
}
}
Expand Down
1 change: 1 addition & 0 deletions src/QueryPipeline/RemoteQueryExecutorReadContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ class RemoteQueryExecutorReadContext : public AsyncTaskExecutor
/// None -> Type -> Body -> None
/// None -> Body -> None
std::atomic<PacketPart> has_read_packet_part = PacketPart::None;
std::atomic_bool has_data_packets = false;
Packet packet;

RemoteQueryExecutor & executor;
Expand Down
Loading
Loading