From 143d08941495448a8c43e5ccdb9354552e970b0d Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Fri, 18 Apr 2025 16:38:04 +0200 Subject: [PATCH] Merge pull request #742 from Altinity/feature/lazy_load_metadata Make DataLake metadata more lazy --- src/Disks/ObjectStorages/IObjectStorage.cpp | 18 ++++++++++++++++++ src/Disks/ObjectStorages/IObjectStorage.h | 2 ++ .../DataLakes/IDataLakeMetadata.cpp | 10 +++++++--- .../ObjectStorage/ReadBufferIterator.cpp | 8 +++----- .../StorageObjectStorageSource.cpp | 16 +--------------- 5 files changed, 31 insertions(+), 23 deletions(-) diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index da10528bbedf..59a4856c69bb 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -97,4 +97,22 @@ WriteSettings IObjectStorage::patchSettings(const WriteSettings & write_settings return write_settings; } + +void RelativePathWithMetadata::loadMetadata(ObjectStoragePtr object_storage, bool ignore_non_existent_file) +{ + if (!metadata) + { + const auto & path = isArchive() ? 
getPathToArchive() : getPath(); + + if (ignore_non_existent_file) + { + metadata = object_storage->tryGetObjectMetadata(path); + } + else + { + metadata = object_storage->getObjectMetadata(path); + } + } +} + } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 0d9464b1ad7e..e6bf5433a664 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -100,6 +100,8 @@ struct RelativePathWithMetadata virtual bool isArchive() const { return false; } virtual std::string getPathToArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); } virtual size_t fileSizeInArchive() const { throw Exception(ErrorCodes::LOGICAL_ERROR, "Not an archive"); } + + void loadMetadata(ObjectStoragePtr object_storage, bool ignore_non_existent_file = true); }; struct ObjectKeyWithMetadata diff --git a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp index c0dce62becaf..df4f5ed3a45b 100644 --- a/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/IDataLakeMetadata.cpp @@ -34,12 +34,16 @@ class KeysIterator : public IObjectIterator return nullptr; auto key = data_files[current_index]; - auto object_metadata = object_storage->getObjectMetadata(key); if (callback) - callback(FileProgress(0, object_metadata.size_bytes)); + { + /// Too expensive to always load metadata just for the size, + /// because it requires an API call to external storage. + /// In many cases only keys are needed. 
+ callback(FileProgress(0, 1)); + } - return std::make_shared(key, std::move(object_metadata)); + return std::make_shared(key, std::nullopt); } } diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index bc2a43af9fd1..cce9cf71dcc3 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -75,10 +75,7 @@ std::optional ReadBufferIterator::tryGetColumnsFromCache( const auto & object_info = (*it); auto get_last_mod_time = [&] -> std::optional { - const auto & path = object_info->isArchive() ? object_info->getPathToArchive() : object_info->getPath(); - if (!object_info->metadata) - object_info->metadata = object_storage->tryGetObjectMetadata(path); - + object_info->loadMetadata(object_storage); return object_info->metadata ? std::optional(object_info->metadata->last_modified.epochTime()) : std::nullopt; @@ -150,7 +147,6 @@ std::unique_ptr ReadBufferIterator::recreateLastReadBuffer() { auto context = getContext(); - const auto & path = current_object_info->isArchive() ? 
current_object_info->getPathToArchive() : current_object_info->getPath(); auto impl = StorageObjectStorageSource::createReadBuffer(*current_object_info, object_storage, context, getLogger("ReadBufferIterator")); const auto compression_method = chooseCompressionMethod(current_object_info->getFileName(), configuration->compression_method); @@ -249,6 +245,8 @@ ReadBufferIterator::Data ReadBufferIterator::next() prev_read_keys_size = read_keys.size(); } + current_object_info->loadMetadata(object_storage); + if (query_settings.skip_empty_files && current_object_info->metadata && current_object_info->metadata->size_bytes == 0) continue; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 4e10bd12e40d..7faec2ad898c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -413,21 +413,10 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade if (!object_info || object_info->getPath().empty()) return {}; - if (!object_info->metadata) - { - const auto & path = object_info->isArchive() ? object_info->getPathToArchive() : object_info->getPath(); - - if (query_settings.ignore_non_existent_file) - { - auto metadata = object_storage->tryGetObjectMetadata(path); - if (!metadata) - return {}; - - object_info->metadata = metadata; - } - else - object_info->metadata = object_storage->getObjectMetadata(path); - } + object_info->loadMetadata(object_storage, query_settings.ignore_non_existent_file); + /// With ignore_non_existent_file a missing object leaves metadata empty; it must not be dereferenced below. + if (!object_info->metadata) + return {}; } while (query_settings.skip_empty_files && object_info->metadata->size_bytes == 0);