diff --git a/programs/library-bridge/CMakeLists.txt b/programs/library-bridge/CMakeLists.txt
index 36b84ca5cd8f..86410d712ec7 100644
--- a/programs/library-bridge/CMakeLists.txt
+++ b/programs/library-bridge/CMakeLists.txt
@@ -19,7 +19,7 @@ target_link_libraries(clickhouse-library-bridge PRIVATE
     daemon
     dbms
     bridge
-    clickhouse_functions_extractkeyvaluepairs
+    clickhouse_functions
 )
 
 set_target_properties(clickhouse-library-bridge PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..)
diff --git a/programs/odbc-bridge/CMakeLists.txt b/programs/odbc-bridge/CMakeLists.txt
index 0d105dafc92d..97ec9cbab438 100644
--- a/programs/odbc-bridge/CMakeLists.txt
+++ b/programs/odbc-bridge/CMakeLists.txt
@@ -22,7 +22,7 @@ target_link_libraries(clickhouse-odbc-bridge PRIVATE
     dbms
     bridge
     clickhouse_parsers
-    clickhouse_functions_extractkeyvaluepairs
+    clickhouse_functions
     ch_contrib::nanodbc
     ch_contrib::unixodbc
 )
diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index 792dacc93ba7..0402b0ec579e 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -5951,6 +5951,9 @@ This only affects operations performed on the client side, in particular parsing
 Normally this setting should be set in user profile (users.xml or queries like `ALTER USER`), not through the client (client command line arguments, `SET` query, or `SETTINGS` section of `SELECT` query). Through the client it can be changed to false, but can't be changed to true (because the server won't send the settings if user profile has `apply_settings_from_server = false`).
 Note that initially (24.12) there was a server setting (`send_settings_to_client`), but latter it got replaced with this client setting, for better usability.
+)", 0) \
+    DECLARE(Bool, object_storage_treat_key_related_wildcards_as_star, false, R"(
+When reading from object storage (e.g. S3 or Azure), treat `{_snowflake_id}` and `{_partition_id}` in the path as `*`. This allows symmetrical reads and writes through a single table.
 )", 0) \
     \
     /* ####################################################### */ \
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index e26edbd4d424..003b22be4700 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -64,6 +64,12 @@ const VersionToSettingsChangesMap & getSettingsChangesHistory()
     /// controls new feature and it's 'true' by default, use 'false' as previous_value).
     /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
     /// Note: please check if the key already exists to prevent duplicate entries.
+
+    addSettingsChanges(settings_changes_history, "25.3",
+    {
+        // Altinity Antalya modifications atop of 25.3
+        {"object_storage_treat_key_related_wildcards_as_star", false, false, "New setting."},
+    });
     addSettingsChanges(settings_changes_history, "25.2.1.20000",
     {
         // Altinity Antalya modifications atop of 25.2
diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp
index 33fc7be1b841..c9180314af80 100644
--- a/src/Functions/generateSnowflakeID.cpp
+++ b/src/Functions/generateSnowflakeID.cpp
@@ -147,6 +147,13 @@ struct Data
 }
 
+uint64_t generateSnowflakeID()
+{
+    Data data;
+    SnowflakeId snowflake_id = data.reserveRange(getMachineId(), 1);
+    return fromSnowflakeId(snowflake_id);
+}
+
 class FunctionGenerateSnowflakeID : public IFunction
 {
 public:
diff --git a/src/Functions/generateSnowflakeID.h b/src/Functions/generateSnowflakeID.h
new file mode 100644
index 000000000000..954a919c2dee
--- /dev/null
+++ b/src/Functions/generateSnowflakeID.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include
+
+namespace DB
+{
+
+uint64_t generateSnowflakeID();
+
+}
diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h
index 5acb42d2a7dc..3ffc242a2367 100644
--- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h
+++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h
@@ -50,10 +50,18 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl
 public:
     using Configuration = StorageObjectStorage::Configuration;
 
+    DataLakeConfiguration() = default;
+
+    DataLakeConfiguration(const DataLakeConfiguration & other)
+        : BaseStorageConfiguration(other)
+        , current_metadata(other.current_metadata ? other.current_metadata->clone() : nullptr) {}
+
     bool isDataLakeConfiguration() const override { return true; }
 
     std::string getEngineName() const override { return DataLakeMetadata::name + BaseStorageConfiguration::getEngineName(); }
 
+    StorageObjectStorage::ConfigurationPtr clone() override { return std::make_shared<DataLakeConfiguration>(*this); }
+
     void update(ObjectStoragePtr object_storage, ContextPtr local_context) override
     {
         BaseStorageConfiguration::update(object_storage, local_context);
diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h
index 95c527c560ce..9df8631b7b2d 100644
--- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h
+++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.h
@@ -39,6 +39,19 @@ class DeltaLakeMetadata final : public IDataLakeMetadata
     DeltaLakeMetadata(ObjectStoragePtr object_storage_, ConfigurationObserverPtr configuration_, ContextPtr context_);
 
+    DeltaLakeMetadata(const DeltaLakeMetadata & other)
+    {
+        object_storage = other.object_storage;
+        data_files = other.data_files;
+        schema = other.schema;
+        partition_columns = other.partition_columns;
+    }
+
+    std::unique_ptr<IDataLakeMetadata> clone() override
+    {
+        return std::make_unique<DeltaLakeMetadata>(*this);
+    }
+
     Strings getDataFiles() const override { return data_files; }
 
     NamesAndTypesList getTableSchema() const override { return schema; }
diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h
index 5147e358c1e1..04632af9908e 100644
--- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h
+++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadataDeltaKernel.h
@@ -37,10 +37,17 @@ class DeltaLakeMetadataDeltaKernel final : public IDataLakeMetadata
         ContextPtr context_,
         bool read_schema_same_as_table_schema_);
 
+    DeltaLakeMetadataDeltaKernel(const DeltaLakeMetadataDeltaKernel & other) : log(other.log), table_snapshot(other.table_snapshot) {}
+
     bool supportsUpdate() const override { return true; }
 
     bool update(const ContextPtr & context) override;
 
+    std::unique_ptr<IDataLakeMetadata> clone() override
+    {
+        return std::make_unique<DeltaLakeMetadataDeltaKernel>(*this);
+    }
+
     Strings getDataFiles() const override;
 
     NamesAndTypesList getTableSchema() const override;
diff --git a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h
index 5b515dc1f37e..d9f0f104ef4b 100644
--- a/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h
+++ b/src/Storages/ObjectStorage/DataLakes/HudiMetadata.h
@@ -19,6 +19,10 @@ class HudiMetadata final : public IDataLakeMetadata, private WithContext
     HudiMetadata(ObjectStoragePtr object_storage_, ConfigurationObserverPtr configuration_, ContextPtr context_);
 
+    HudiMetadata(const HudiMetadata & other)
+        : WithContext(other.getContext()), object_storage(other.object_storage), configuration(other.configuration), data_files(other.data_files)
+    {}
+
     Strings getDataFiles() const override;
 
     NamesAndTypesList getTableSchema() const override { return {}; }
@@ -39,6 +43,11 @@ class HudiMetadata final : public IDataLakeMetadata, private WithContext
         return std::make_unique(object_storage, configuration, local_context);
     }
 
+    std::unique_ptr<IDataLakeMetadata> clone() override
+    {
+        return std::make_unique<HudiMetadata>(*this);
+    }
+
 protected:
     ObjectIterator iterate(
         const ActionsDAG * filter_dag,
diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
index 1984750351bc..d64653a84d61 100644
--- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
+++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.h
@@ -52,6 +52,8 @@ class IDataLakeMetadata : boost::noncopyable
     virtual std::optional totalRows() const { return {}; }
     virtual std::optional totalBytes() const { return {}; }
 
+    virtual std::unique_ptr<IDataLakeMetadata> clone() = 0;
+
 protected:
     ObjectIterator createKeysIterator(
         Strings && data_files_,
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
index 23b0cd04b897..f98055094cc2 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h
@@ -42,6 +42,28 @@ class IcebergMetadata : public IDataLakeMetadata, private WithContext
         const Poco::JSON::Object::Ptr & metadata_object,
         IcebergMetadataFilesCachePtr cache_ptr);
 
+    IcebergMetadata(const IcebergMetadata & other)
+        : WithContext(other.getContext())
+        , object_storage(other.object_storage)
+        , configuration(other.configuration)
+        , schema_processor(other.schema_processor.clone())
+        , log(other.log)
+        , manifest_cache(other.manifest_cache)
+        , manifest_file_by_data_file(other.manifest_file_by_data_file)
+        , last_metadata_version(other.last_metadata_version)
+        , last_metadata_object(other.last_metadata_object)
+        , format_version(other.format_version)
+        , relevant_snapshot_schema_id(other.relevant_snapshot_schema_id)
+        , relevant_snapshot(other.relevant_snapshot)
+        , relevant_snapshot_id(other.relevant_snapshot_id)
+        , table_location(other.table_location)
+    {}
+
+    std::unique_ptr<IDataLakeMetadata> clone() override
+    {
+        return std::make_unique<IcebergMetadata>(*this);
+    }
+
     /// Get data files. On first request it reads manifest_list file and iterates through manifest files to find all data files.
     /// All subsequent calls when the same data snapshot is relevant will return saved list of files (because it cannot be changed
     /// without changing metadata file). Drops on every snapshot update.
diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h
index 311d723c83b2..736d996cb897 100644
--- a/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h
+++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/SchemaProcessor.h
@@ -76,6 +76,24 @@ class IcebergSchemaProcessor
     using Node = ActionsDAG::Node;
 
 public:
+    IcebergSchemaProcessor() = default;
+
+    IcebergSchemaProcessor clone()
+    {
+        std::lock_guard lock(mutex);
+
+        IcebergSchemaProcessor ret;
+
+        ret.iceberg_table_schemas_by_ids = iceberg_table_schemas_by_ids;
+        ret.clickhouse_table_schemas_by_ids = clickhouse_table_schemas_by_ids;
+        ret.transform_dags_by_ids = transform_dags_by_ids;
+        ret.clickhouse_types_by_source_ids = clickhouse_types_by_source_ids;
+        ret.clickhouse_ids_by_source_names = clickhouse_ids_by_source_names;
+        ret.current_schema_id = current_schema_id;
+
+        return ret;
+    }
+
     void addIcebergTableSchema(Poco::JSON::Object::Ptr schema_ptr);
     std::shared_ptr getClickhouseTableSchemaById(Int32 id);
     std::shared_ptr getSchemaTransformationDagByIds(Int32 old_id, Int32 new_id);
@@ -87,6 +105,15 @@ class IcebergSchemaProcessor
     bool hasClickhouseTableSchemaById(Int32 id) const;
 
 private:
+    IcebergSchemaProcessor(const IcebergSchemaProcessor & other)
+        : iceberg_table_schemas_by_ids(other.iceberg_table_schemas_by_ids),
+          clickhouse_table_schemas_by_ids(other.clickhouse_table_schemas_by_ids),
+          transform_dags_by_ids(other.transform_dags_by_ids),
+          clickhouse_types_by_source_ids(other.clickhouse_types_by_source_ids),
+          clickhouse_ids_by_source_names(other.clickhouse_ids_by_source_names),
+          current_schema_id(other.current_schema_id)
+    {}
+
     std::unordered_map iceberg_table_schemas_by_ids;
     std::unordered_map> clickhouse_table_schemas_by_ids;
     std::map, std::shared_ptr> transform_dags_by_ids;
diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
index eea47cbf9044..b55c39e22ee2 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
@@ -36,6 +36,7 @@ namespace Setting
     extern const SettingsMaxThreads max_threads;
     extern const SettingsBool optimize_count_from_files;
     extern const SettingsBool use_hive_partitioning;
+    extern const SettingsBool object_storage_treat_key_related_wildcards_as_star;
 }
 
 namespace ErrorCodes
@@ -373,21 +374,35 @@ void StorageObjectStorage::read(
     if (update_configuration_on_read)
         configuration->update(object_storage, local_context);
 
-    if (partition_by && configuration->withPartitionWildcard())
+    auto config_clone = configuration->clone();
+
+    if (config_clone->withPartitionWildcard() || config_clone->withSnowflakeIdWildcard())
     {
-        throw Exception(ErrorCodes::NOT_IMPLEMENTED,
-                        "Reading from a partitioned {} storage is not implemented yet",
-                        getName());
+        /*
+         * Replace `_partition_id` and `_snowflake_id` wildcards with `*` so that any files that match this pattern can be retrieved.
+         */
+        if (local_context->getSettingsRef()[Setting::object_storage_treat_key_related_wildcards_as_star])
+        {
+            config_clone->setPath(getPathWithKeyRelatedWildcardsReplacedWithStar(config_clone->getPath()));
+        }
+
+        if (config_clone->withPartitionWildcard() || config_clone->withSnowflakeIdWildcard())
+        {
+            throw Exception(ErrorCodes::NOT_IMPLEMENTED,
+                            "Reading from a globbed path {} on storage {} is not implemented yet, "
+                            "except when the only globs are `_snowflake_id` and/or `_partition_id` with `object_storage_treat_key_related_wildcards_as_star=1`",
+                            config_clone->getPath(), getName());
+        }
     }
 
-    const auto read_from_format_info = configuration->prepareReadingFromFormat(
+    const auto read_from_format_info = config_clone->prepareReadingFromFormat(
         object_storage, column_names, storage_snapshot, supportsSubsetOfColumns(local_context), local_context);
 
     const bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty())
         && local_context->getSettingsRef()[Setting::optimize_count_from_files];
 
     auto read_step = std::make_unique(
         object_storage,
-        configuration,
+        config_clone,
         fmt::format("{}({})", getName(), getStorageID().getFullTableName()),
         column_names,
         getVirtualsList(),
@@ -421,7 +436,7 @@ SinkToStoragePtr StorageObjectStorage::write(
             configuration->getPath());
     }
 
-    if (configuration->withGlobsIgnorePartitionWildcard())
+    if (configuration->withGlobsIgnorePartitionWildcardAndSnowflakeIdWildcard())
     {
         throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED,
                         "Path '{}' contains globs, so the table is in readonly mode",
@@ -650,6 +665,7 @@ StorageObjectStorage::Configuration::Configuration(const Configuration & other)
     format = other.format;
     compression_method = other.compression_method;
     structure = other.structure;
+    storage_settings = other.storage_settings;
 }
 
 bool StorageObjectStorage::Configuration::withPartitionWildcard() const
@@ -659,6 +675,13 @@
         || getNamespace().find(PARTITION_ID_WILDCARD) != String::npos;
 }
 
+bool StorageObjectStorage::Configuration::withSnowflakeIdWildcard() const
+{
+    static const String SNOWFLAKE_ID_WILDCARD = "{_snowflake_id}";
+    return getPath().find(SNOWFLAKE_ID_WILDCARD) != String::npos
+        || getNamespace().find(SNOWFLAKE_ID_WILDCARD) != String::npos;
+}
+
 bool StorageObjectStorage::Configuration::withGlobsIgnorePartitionWildcard() const
 {
     if (!withPartitionWildcard())
@@ -666,6 +689,15 @@ bool StorageObjectStorage::Configuration::withGlobsIgnorePartitionWildcard() const
     return PartitionedSink::replaceWildcards(getPath(), "").find_first_of("*?{") != std::string::npos;
 }
 
+bool StorageObjectStorage::Configuration::withGlobsIgnorePartitionWildcardAndSnowflakeIdWildcard() const
+{
+    const auto path_without_partition_id_wildcard = PartitionedSink::replaceWildcards(getPath(), "");
+
+    const auto path_without_snowflake_id_wildcard = replaceSnowflakeIdWildcard(path_without_partition_id_wildcard, "");
+
+    return path_without_snowflake_id_wildcard.find_first_of("*?{") != std::string::npos;
+}
+
 bool StorageObjectStorage::Configuration::isPathWithGlobs() const
 {
     return getPath().find_first_of("*?{") != std::string::npos;
diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h
index 2aef633a64d4..1b41e49c87e1 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorage.h
+++ b/src/Storages/ObjectStorage/StorageObjectStorage.h
@@ -206,8 +206,10 @@ class StorageObjectStorage::Configuration
         ASTs & args, const String & structure_, const String & format_, ContextPtr context, bool with_structure) = 0;
 
     virtual bool withPartitionWildcard() const;
+    virtual bool withSnowflakeIdWildcard() const;
     bool withGlobs() const { return isPathWithGlobs() || isNamespaceWithGlobs(); }
     virtual bool withGlobsIgnorePartitionWildcard() const;
+    virtual bool withGlobsIgnorePartitionWildcardAndSnowflakeIdWildcard() const;
     virtual bool isPathWithGlobs() const;
     virtual bool isNamespaceWithGlobs() const;
     virtual std::string getPathWithoutGlobs() const;
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp
index 263188301877..5f22e564766a 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 
 namespace DB
 {
@@ -20,6 +21,22 @@ namespace ErrorCodes
     extern const int BAD_ARGUMENTS;
 }
 
+namespace
+{
+void validateKey(const String & str)
+{
+    /// See:
+    /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
+    /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject
+
+    if (str.empty() || str.size() > 1024)
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size());
+
+    if (!UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data()), str.size()))
+        throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key");
+}
+}
+
 StorageObjectStorageSink::StorageObjectStorageSink(
     ObjectStoragePtr object_storage,
     ConfigurationPtr configuration,
@@ -31,7 +48,11 @@ StorageObjectStorageSink::StorageObjectStorageSink(
     , sample_block(sample_block_)
 {
     const auto & settings = context->getSettingsRef();
-    const auto path = blob_path.empty() ? configuration->getPaths().back() : blob_path;
+    auto path = blob_path.empty() ? configuration->getPaths().back() : blob_path;
+    path = fillSnowflakeIdWildcard(path);
+
+    validateKey(path);
+
     const auto chosen_compression_method = chooseCompressionMethod(path, configuration->getCompressionMethod());
 
     auto buffer = object_storage->writeObject(
@@ -121,6 +142,8 @@ StorageObjectStorageSink::~StorageObjectStorageSink()
 
 SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String & partition_id)
 {
+    validatePartitionKey(partition_id, true);
+
     auto partition_bucket = replaceWildcards(configuration->getNamespace(), partition_id);
     validateNamespace(partition_bucket);
 
@@ -143,21 +166,6 @@ SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String
     );
 }
 
-void PartitionedStorageObjectStorageSink::validateKey(const String & str)
-{
-    /// See:
-    /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
-    /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject
-
-    if (str.empty() || str.size() > 1024)
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size());
-
-    if (!UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data()), str.size()))
-        throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key");
-
-    validatePartitionKey(str, true);
-}
-
 void PartitionedStorageObjectStorageSink::validateNamespace(const String & str)
 {
     configuration->validateNamespace(str);
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h
index 97fd3d9b4179..50bc2c4299fe 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h
@@ -52,7 +52,6 @@ class PartitionedStorageObjectStorageSink : public PartitionedSink
     SinkPtr createSinkForPartition(const String & partition_id) override;
 
 private:
-    void validateKey(const String & str);
     void validateNamespace(const String & str);
 
     ObjectStoragePtr object_storage;
diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp
index d4f152bfd582..50f19d069c6c 100644
--- a/src/Storages/ObjectStorage/Utils.cpp
+++ b/src/Storages/ObjectStorage/Utils.cpp
@@ -1,10 +1,19 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 
 namespace DB
 {
 
+namespace Setting
+{
+    extern const SettingsBool object_storage_treat_key_related_wildcards_as_star;
+}
+
 namespace ErrorCodes
 {
     extern const int BAD_ARGUMENTS;
@@ -51,6 +60,15 @@ void resolveSchemaAndFormat(
     std::string & sample_path,
     const ContextPtr & context)
 {
+    /*
+     * Replace `_partition_id` and `_snowflake_id` wildcards with `*` so that any files that match this pattern can be retrieved.
+     */
+    const auto old_path = configuration->getPath();
+    if (context->getSettingsRef()[Setting::object_storage_treat_key_related_wildcards_as_star])
+    {
+        configuration->setPath(getPathWithKeyRelatedWildcardsReplacedWithStar(configuration->getPath()));
+    }
+
     if (columns.empty())
     {
         if (configuration->getFormat() == "auto")
@@ -68,6 +86,9 @@ void resolveSchemaAndFormat(
         configuration->setFormat(StorageObjectStorage::resolveFormatFromData(object_storage, configuration, format_settings, sample_path, context));
     }
 
+    // restore the original (globbed) path
+    configuration->setPath(old_path);
+
     if (!columns.hasOnlyOrdinary())
     {
         /// We don't allow special columns.
@@ -77,4 +98,19 @@ void resolveSchemaAndFormat(
 }
 
+std::string replaceSnowflakeIdWildcard(const std::string & input, const std::string & snowflake_id)
+{
+    return boost::replace_all_copy(input, "{_snowflake_id}", snowflake_id);
+}
+
+std::string fillSnowflakeIdWildcard(const std::string & input)
+{
+    return replaceSnowflakeIdWildcard(input, std::to_string(generateSnowflakeID()));
+}
+
+std::string getPathWithKeyRelatedWildcardsReplacedWithStar(const std::string & path)
+{
+    return replaceSnowflakeIdWildcard(PartitionedSink::replaceWildcards(path, "*"), "*");
+}
+
 }
diff --git a/src/Storages/ObjectStorage/Utils.h b/src/Storages/ObjectStorage/Utils.h
index 17e30babb709..12ec0c52cf91 100644
--- a/src/Storages/ObjectStorage/Utils.h
+++ b/src/Storages/ObjectStorage/Utils.h
@@ -21,4 +21,10 @@ void resolveSchemaAndFormat(
     std::string & sample_path,
     const ContextPtr & context);
 
+std::string replaceSnowflakeIdWildcard(const std::string & input, const std::string & snowflake_id);
+
+std::string fillSnowflakeIdWildcard(const std::string & input);
+
+std::string getPathWithKeyRelatedWildcardsReplacedWithStar(const std::string & path);
+
 }
diff --git a/tests/queries/0_stateless/03378_object_storage_snowflake_id_wildcard.reference b/tests/queries/0_stateless/03378_object_storage_snowflake_id_wildcard.reference
new file mode 100644
index 000000000000..415f8dcd8f10
--- /dev/null
+++ b/tests/queries/0_stateless/03378_object_storage_snowflake_id_wildcard.reference
@@ -0,0 +1,44 @@
+test/t_03378_parquet/(2022,\'USA\')/.parquet 1
+test/t_03378_parquet/(2022,\'Canada\')/.parquet 2
+test/t_03378_parquet/(2023,\'USA\')/.parquet 3
+test/t_03378_parquet/(2023,\'Mexico\')/.parquet 4
+test/t_03378_parquet/(2024,\'France\')/.parquet 5
+test/t_03378_parquet/(2024,\'Germany\')/.parquet 6
+test/t_03378_parquet/(2024,\'Germany\')/.parquet 7
+test/t_03378_parquet/(1999,\'Brazil\')/.parquet 8
+test/t_03378_parquet/(2100,\'Japan\')/.parquet 9
+test/t_03378_parquet/(2024,\'CN\')/.parquet 10
+test/t_03378_parquet/(2025,\'\')/.parquet 11
+test/t_03378_csv/(2022,\'USA\')/.csv 1
+test/t_03378_csv/(2022,\'Canada\')/.csv 2
+test/t_03378_csv/(2023,\'USA\')/.csv 3
+test/t_03378_csv/(2023,\'Mexico\')/.csv 4
+test/t_03378_csv/(2024,\'France\')/.csv 5
+test/t_03378_csv/(2024,\'Germany\')/.csv 6
+test/t_03378_csv/(2024,\'Germany\')/.csv 7
+test/t_03378_csv/(1999,\'Brazil\')/.csv 8
+test/t_03378_csv/(2100,\'Japan\')/.csv 9
+test/t_03378_csv/(2024,\'CN\')/.csv 10
+test/t_03378_csv/(2025,\'\')/.csv 11
+test/t_03378_function/(2022,\'USA\')/.parquet 1
+test/t_03378_function/(2022,\'Canada\')/.parquet 2
+test/t_03378_function/(2023,\'USA\')/.parquet 3
+test/t_03378_function/(2023,\'Mexico\')/.parquet 4
+test/t_03378_function/(2024,\'France\')/.parquet 5
+test/t_03378_function/(2024,\'Germany\')/.parquet 6
+test/t_03378_function/(2024,\'Germany\')/.parquet 7
+test/t_03378_function/(1999,\'Brazil\')/.parquet 8
+test/t_03378_function/(2100,\'Japan\')/.parquet 9
+test/t_03378_function/(2024,\'CN\')/.parquet 10
+test/t_03378_function/(2025,\'\')/.parquet 11
+test/t_03378_function/(2022,\'USA\')/.parquet 1
+test/t_03378_function/(2022,\'Canada\')/.parquet 2
+test/t_03378_function/(2023,\'USA\')/.parquet 3
+test/t_03378_function/(2023,\'Mexico\')/.parquet 4
+test/t_03378_function/(2024,\'France\')/.parquet 5
+test/t_03378_function/(2024,\'Germany\')/.parquet 6
+test/t_03378_function/(2024,\'Germany\')/.parquet 7
+test/t_03378_function/(1999,\'Brazil\')/.parquet 8
+test/t_03378_function/(2100,\'Japan\')/.parquet 9
+test/t_03378_function/(2024,\'CN\')/.parquet 10
+test/t_03378_function/(2025,\'\')/.parquet 11
diff --git a/tests/queries/0_stateless/03378_object_storage_snowflake_id_wildcard.sql b/tests/queries/0_stateless/03378_object_storage_snowflake_id_wildcard.sql
new file mode 100644
index 000000000000..ca51e3be2984
--- /dev/null
+++ b/tests/queries/0_stateless/03378_object_storage_snowflake_id_wildcard.sql
@@ -0,0 +1,44 @@
+DROP TABLE IF EXISTS t_03378_parquet, t_03378_csv;
+
+CREATE TABLE t_03378_parquet (year UInt16, country String, counter UInt8)
+ENGINE = S3(s3_conn, filename = 't_03378_parquet/{_partition_id}/{_snowflake_id}.parquet', format = Parquet)
+PARTITION BY (year, country);
+
+INSERT INTO t_03378_parquet VALUES
+    (2022, 'USA', 1),
+    (2022, 'Canada', 2),
+    (2023, 'USA', 3),
+    (2023, 'Mexico', 4),
+    (2024, 'France', 5),
+    (2024, 'Germany', 6),
+    (2024, 'Germany', 7),
+    (1999, 'Brazil', 8),
+    (2100, 'Japan', 9),
+    (2024, 'CN', 10),
+    (2025, '', 11);
+
+select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from t_03378_parquet order by counter SETTINGS object_storage_treat_key_related_wildcards_as_star=1;
+
+CREATE TABLE t_03378_csv (year UInt16, country String, counter UInt8)
+ENGINE = S3(s3_conn, filename = 't_03378_csv/{_partition_id}/{_snowflake_id}.csv', format = CSV)
+PARTITION BY (year, country);
+
+INSERT INTO t_03378_csv VALUES
+    (2022, 'USA', 1),
+    (2022, 'Canada', 2),
+    (2023, 'USA', 3),
+    (2023, 'Mexico', 4),
+    (2024, 'France', 5),
+    (2024, 'Germany', 6),
+    (2024, 'Germany', 7),
+    (1999, 'Brazil', 8),
+    (2100, 'Japan', 9),
+    (2024, 'CN', 10),
+    (2025, '', 11);
+
+select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03378_csv order by counter SETTINGS object_storage_treat_key_related_wildcards_as_star=1;
+
+-- s3 table function
+INSERT INTO FUNCTION s3(s3_conn, filename='t_03378_function/{_partition_id}/{_snowflake_id}.parquet', format=Parquet) PARTITION BY (year, country) SELECT country, year, counter FROM t_03378_parquet SETTINGS object_storage_treat_key_related_wildcards_as_star=1;
+select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03378_function/**.parquet') order by counter SETTINGS object_storage_treat_key_related_wildcards_as_star=1;
+select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03378_function/{_partition_id}/{_snowflake_id}.parquet') order by counter SETTINGS object_storage_treat_key_related_wildcards_as_star=1;
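For illustration, a minimal usage sketch of the new setting outside the test harness (the table name, bucket URL and credential-free access below are hypothetical and not part of the patch). With `object_storage_treat_key_related_wildcards_as_star = 1`, the same partitioned S3 table can be used for both writes and reads: on INSERT, `{_partition_id}` is filled per partition and `{_snowflake_id}` receives a freshly generated snowflake ID, so each written file gets a distinct name; on SELECT, both wildcards are expanded to `*` and every matching file is read.

-- Hypothetical usage sketch; bucket URL and table name are assumptions.
CREATE TABLE events (year UInt16, country String, value UInt32)
ENGINE = S3('https://my-bucket.s3.amazonaws.com/events/{_partition_id}/{_snowflake_id}.parquet', 'Parquet')
PARTITION BY (year, country);

INSERT INTO events VALUES (2024, 'DE', 1), (2024, 'FR', 2);

-- Without the setting this SELECT throws NOT_IMPLEMENTED, because the path still contains the wildcards;
-- with it enabled, the path is treated as 'events/*/*.parquet'.
SELECT count()
FROM events
SETTINGS object_storage_treat_key_related_wildcards_as_star = 1;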