diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index ec8b6c1b32f..f759819bb2d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -565,6 +565,12 @@ if(ARROW_WITH_ZSTD) list(APPEND ARROW_UTIL_SRCS util/compression_zstd.cc) endif() +# ALP (for Parquet encoder/decoder) +list(APPEND ARROW_UTIL_SRCS + util/alp/Alp.cc + util/alp/AlpSampler.cc + util/alp/AlpWrapper.cc) + arrow_add_object_library(ARROW_UTIL ${ARROW_UTIL_SRCS}) # Disable DLL exports in vendored uriparser library diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index a41b63f07b3..6e27e5af273 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -103,6 +103,13 @@ add_arrow_test(bit-utility-test rle_encoding_test.cc test_common.cc) +add_arrow_test(alp-test + SOURCES + alp/alp_test.cc + alp/Alp.cc + alp/AlpSampler.cc + alp/AlpWrapper.cc) + add_arrow_test(crc32-test SOURCES crc32_test.cc diff --git a/cpp/src/arrow/util/alp/Alp.cc b/cpp/src/arrow/util/alp/Alp.cc new file mode 100644 index 00000000000..777e4b2c72a --- /dev/null +++ b/cpp/src/arrow/util/alp/Alp.cc @@ -0,0 +1,789 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/alp/Alp.h" + +#include +#include +#include +#include +#include + +#include "arrow/util/alp/AlpConstants.h" +#include "arrow/util/bit_stream_utils_internal.h" +#include "arrow/util/bpacking_internal.h" +#include "arrow/util/logging.h" +#include "arrow/util/small_vector.h" +#include "arrow/util/span.h" +#include "arrow/util/ubsan.h" + +namespace arrow { +namespace util { +namespace alp { + +// ---------------------------------------------------------------------- +// AlpEncodedVectorInfo implementation + +bool AlpEncodedVectorInfo::operator==(const AlpEncodedVectorInfo& other) const { + return exponent_and_factor == other.exponent_and_factor && + frame_of_reference == other.frame_of_reference && + bit_width == other.bit_width && bit_packed_size == other.bit_packed_size && + num_elements == other.num_elements && num_exceptions == other.num_exceptions; +} + +void AlpEncodedVectorInfo::Store(arrow::util::span output_buffer) const { + ARROW_CHECK(output_buffer.size() >= GetStoredSize()) + << "alp_vector_info_output_too_small: " << output_buffer.size() << " vs " + << GetStoredSize(); + + std::memcpy(output_buffer.data(), this, GetStoredSize()); +} + +AlpEncodedVectorInfo AlpEncodedVectorInfo::Load( + arrow::util::span input_buffer) { + ARROW_CHECK(input_buffer.size() >= GetStoredSize()) + << "alp_vector_info_input_too_small: " << input_buffer.size() << " vs " + << GetStoredSize(); + + AlpEncodedVectorInfo result; + std::memcpy(&result, input_buffer.data(), GetStoredSize()); + ARROW_CHECK(result.num_elements <= AlpConstants::kAlpVectorSize) + << "alp_compression_state_element_count_too_large: " << result.num_elements + << " vs " << AlpConstants::kAlpVectorSize; + + return result; +} + +uint64_t AlpEncodedVectorInfo::GetStoredSize() { return sizeof(AlpEncodedVectorInfo); } + +// ---------------------------------------------------------------------- +// AlpEncodedVector implementation + +template +void AlpEncodedVector::Store(arrow::util::span 
output_buffer) const { + const uint64_t overall_size = GetStoredSize(); + ARROW_CHECK(output_buffer.size() >= overall_size) + << "alp_bit_packed_vector_store_output_too_small: " << output_buffer.size() + << " vs " << overall_size; + vector_info.Store(output_buffer); + uint64_t compression_offset = AlpEncodedVectorInfo::GetStoredSize(); + + // Store all successfully compressed values first. + std::memcpy(output_buffer.data() + compression_offset, packed_values.data(), + vector_info.bit_packed_size); + compression_offset += vector_info.bit_packed_size; + + ARROW_CHECK(vector_info.num_exceptions == exceptions.size() && + vector_info.num_exceptions == exception_positions.size()) + << "alp_bit_packed_vector_store_num_exceptions_mismatch: " + << vector_info.num_exceptions << " vs " << exceptions.size() << " vs " + << exception_positions.size(); + + // Store exceptions, consisting of their positions and their values. + const uint64_t exception_position_size = + vector_info.num_exceptions * sizeof(AlpConstants::PositionType); + std::memcpy(output_buffer.data() + compression_offset, exception_positions.data(), + exception_position_size); + compression_offset += exception_position_size; + + const uint64_t exception_size = vector_info.num_exceptions * sizeof(T); + std::memcpy(output_buffer.data() + compression_offset, exceptions.data(), + exception_size); + compression_offset += exception_size; + + ARROW_CHECK(compression_offset == overall_size) + << "alp_bit_packed_vector_size_mismatch: " << compression_offset << " vs " + << overall_size; +} + +/// \brief Deserialize an encoded vector by copying it out of input_buffer +/// +/// Reads the AlpEncodedVectorInfo header first, then copies the packed values, +/// the exception positions and the exception values that follow it, in that order. +/// Fails an ARROW_CHECK when the header fields are inconsistent with the buffer +/// or exceed what a single vector can hold. +template +AlpEncodedVector AlpEncodedVector::Load( + arrow::util::span input_buffer) { + AlpEncodedVector result; + result.vector_info = AlpEncodedVectorInfo::Load(input_buffer); + uint64_t input_offset = AlpEncodedVectorInfo::GetStoredSize(); + + const uint64_t overall_size = GetStoredSize(result.vector_info); + + ARROW_CHECK(input_buffer.size() >= overall_size) + << "alp_compression_state_input_too_small: " << input_buffer.size() << 
" vs " + << overall_size; + ARROW_CHECK(result.vector_info.num_elements <= AlpConstants::kAlpVectorSize) + << "alp_compression_state_element_count_too_large: " + << result.vector_info.num_elements << " vs " << AlpConstants::kAlpVectorSize; + // The header is read from serialized bytes, so bound the remaining counts before + // they size packed_values / exceptions through UnsafeResize below. overall_size + // is derived from these same fields and can wrap, so the buffer-size check above + // is not sufficient on its own. Exceptions are a subset of the elements and a + // value never needs more than sizeof(T) * 8 bits, so well-formed input passes. + ARROW_CHECK(result.vector_info.num_exceptions <= result.vector_info.num_elements) + << "alp_compression_state_exception_count_too_large: " + << result.vector_info.num_exceptions << " vs " << result.vector_info.num_elements; + ARROW_CHECK(result.vector_info.bit_packed_size <= + AlpConstants::kAlpVectorSize * sizeof(T)) + << "alp_compression_state_bit_packed_size_too_large: " + << result.vector_info.bit_packed_size << " vs " + << AlpConstants::kAlpVectorSize * sizeof(T); + + // Optimization: Use UnsafeResize to avoid zero-initialization before memcpy. + // This is safe for POD types since we immediately overwrite with memcpy. + result.packed_values.UnsafeResize(result.vector_info.bit_packed_size); + std::memcpy(result.packed_values.data(), input_buffer.data() + input_offset, + result.vector_info.bit_packed_size); + input_offset += result.vector_info.bit_packed_size; + + result.exception_positions.UnsafeResize(result.vector_info.num_exceptions); + const uint64_t exception_position_size = + result.vector_info.num_exceptions * sizeof(AlpConstants::PositionType); + std::memcpy(result.exception_positions.data(), input_buffer.data() + input_offset, + exception_position_size); + input_offset += exception_position_size; + + result.exceptions.UnsafeResize(result.vector_info.num_exceptions); + const uint64_t exception_size = result.vector_info.num_exceptions * sizeof(T); + std::memcpy(result.exceptions.data(), input_buffer.data() + input_offset, + exception_size); + return result; +} + +template +uint64_t AlpEncodedVector::GetStoredSize() const { + return AlpEncodedVectorInfo::GetStoredSize() + vector_info.bit_packed_size + + vector_info.num_exceptions * (sizeof(AlpConstants::PositionType) + sizeof(T)); +} + +// ---------------------------------------------------------------------- +// AlpEncodedVectorView implementation + +template +AlpEncodedVectorView AlpEncodedVectorView::LoadView( + arrow::util::span input_buffer) { + AlpEncodedVectorView result; + result.vector_info = AlpEncodedVectorInfo::Load(input_buffer); + uint64_t input_offset = AlpEncodedVectorInfo::GetStoredSize(); + + const uint64_t overall_size = AlpEncodedVector::GetStoredSize(result.vector_info); + + ARROW_CHECK(input_buffer.size() >= 
overall_size) + << "alp_view_input_too_small: " << input_buffer.size() << " vs " << overall_size; + ARROW_CHECK(result.vector_info.num_elements <= AlpConstants::kAlpVectorSize) + << "alp_view_element_count_too_large: " << result.vector_info.num_elements + << " vs " << AlpConstants::kAlpVectorSize; + + // Create spans pointing directly into the input buffer (zero-copy) + result.packed_values = { + reinterpret_cast(input_buffer.data() + input_offset), + result.vector_info.bit_packed_size}; + input_offset += result.vector_info.bit_packed_size; + + const uint64_t exception_position_size = + result.vector_info.num_exceptions * sizeof(AlpConstants::PositionType); + result.exception_positions = { + reinterpret_cast(input_buffer.data() + input_offset), + result.vector_info.num_exceptions}; + input_offset += exception_position_size; + + result.exceptions = {reinterpret_cast(input_buffer.data() + input_offset), + result.vector_info.num_exceptions}; + + return result; +} + +template +uint64_t AlpEncodedVectorView::GetStoredSize() const { + return AlpEncodedVectorInfo::GetStoredSize() + vector_info.bit_packed_size + + vector_info.num_exceptions * (sizeof(AlpConstants::PositionType) + sizeof(T)); +} + +template struct AlpEncodedVectorView; +template struct AlpEncodedVectorView; + +template +uint64_t AlpEncodedVector::GetStoredSize(const AlpEncodedVectorInfo& info) { + return AlpEncodedVectorInfo::GetStoredSize() + info.bit_packed_size + + info.num_exceptions * (sizeof(AlpConstants::PositionType) + sizeof(T)); +} + +template +bool AlpEncodedVector::operator==(const AlpEncodedVector& other) const { + // Manual comparison since StaticVector doesn't have operator==. 
+ const bool packed_values_equal = + (packed_values.size() == other.packed_values.size()) && + std::equal(packed_values.begin(), packed_values.end(), + other.packed_values.begin()); + const bool exceptions_equal = + (exceptions.size() == other.exceptions.size()) && + std::equal(exceptions.begin(), exceptions.end(), other.exceptions.begin()); + const bool exception_positions_equal = + (exception_positions.size() == other.exception_positions.size()) && + std::equal(exception_positions.begin(), exception_positions.end(), + other.exception_positions.begin()); + return vector_info == other.vector_info && packed_values_equal && exceptions_equal && + exception_positions_equal; +} + +template class AlpEncodedVector; +template class AlpEncodedVector; + +// ---------------------------------------------------------------------- +// Internal helper classes + +namespace { + +/// \brief Helper class for encoding/decoding individual values +template +class AlpInlines : private AlpConstants { + public: + using Constants = AlpTypedConstants; + using ExactType = typename Constants::FloatingToExact; + using SignedExactType = typename Constants::FloatingToSignedExact; + + /// \brief Check if float is a special value that cannot be converted + static inline bool IsImpossibleToEncode(const T n) { + // We do not have to check for positive or negative infinity, since + // std::numeric_limits::infinity() > std::numeric_limits::max() + // and vice versa for negative infinity. 
+ return std::isnan(n) || n > Constants::kEncodingUpperLimit || + n < Constants::kEncodingLowerLimit || + (n == 0.0 && std::signbit(n)); // Verification for -0.0 + } + + /// \brief Snap a float to an integer by adding and subtracting kMagicNumber, then cast + /// + /// NOTE(review): presumably kMagicNumber is chosen so that the add/subtract rounds + /// to the nearest integer (i.e. no explicit rounding call is needed) -- confirm + /// against AlpConstants.h. + static inline auto FastRound(T n) -> SignedExactType { + n = n + Constants::kMagicNumber - Constants::kMagicNumber; + return static_cast(n); + } + + /// \brief Fast way to round float to nearest integer + /// + /// Inputs rejected by IsImpossibleToEncode() yield kEncodingUpperLimit cast to the + /// signed type; every other input goes through FastRound(). + static inline auto NumberToInt(T n) -> SignedExactType { + if (IsImpossibleToEncode(n)) { + return static_cast(Constants::kEncodingUpperLimit); + } + return FastRound(n); + } + + /// \brief Convert a float into an int using encoding options + /// + /// Multiplies value by GetExponent(exponent) and GetFactor(factor), then converts + /// the product with NumberToInt(). + static inline SignedExactType EncodeValue( + const T value, const AlpExponentAndFactor exponent_and_factor) { + const T tmp_encoded_value = value * + Constants::GetExponent(exponent_and_factor.exponent) * + Constants::GetFactor(exponent_and_factor.factor); + return NumberToInt(tmp_encoded_value); + } + + /// \brief Reconvert an int to a float using encoding options + /// + /// NOTE(review): the unqualified GetFactor() presumably resolves to the one in + /// AlpConstants (AlpInlines inherits from it privately), while + /// Constants::GetFactor() is the typed one and is indexed by the exponent here -- + /// confirm both tables in AlpConstants.h. + static inline T DecodeValue(const SignedExactType encoded_value, + const AlpExponentAndFactor exponent_and_factor) { + // The cast to T is needed to prevent a signed integer overflow. + return static_cast(encoded_value) * GetFactor(exponent_and_factor.factor) * + Constants::GetFactor(exponent_and_factor.exponent); + } +}; + +/// \brief Helper struct for tracking compression combinations +struct AlpCombination { + // Exponent/factor pair this entry describes. + AlpExponentAndFactor exponent_and_factor; + // How often the pair was selected as best; first ranking key. + uint64_t num_appearances{0}; + // Estimated compressed size for the pair; second ranking key (smaller wins). + uint64_t estimated_compression_size{0}; +}; + +/// \brief Compare two ALP combinations to determine which is better +/// +/// Return true if c1 is a better combination than c2. +/// First criterion is the number of times it appears as best combination. +/// Second criterion is the estimated compression size (smaller is better). +/// Third criterion is the bigger exponent. +/// Fourth criterion is the bigger factor. 
+bool CompareAlpCombinations(const AlpCombination& c1, const AlpCombination& c2) { + return (c1.num_appearances > c2.num_appearances) || + (c1.num_appearances == c2.num_appearances && + (c1.estimated_compression_size < c2.estimated_compression_size)) || + ((c1.num_appearances == c2.num_appearances && + c1.estimated_compression_size == c2.estimated_compression_size) && + (c2.exponent_and_factor.exponent < c1.exponent_and_factor.exponent)) || + ((c1.num_appearances == c2.num_appearances && + c1.estimated_compression_size == c2.estimated_compression_size && + c2.exponent_and_factor.exponent == c1.exponent_and_factor.exponent) && + (c2.exponent_and_factor.factor < c1.exponent_and_factor.factor)); +} + +} // namespace + +// ---------------------------------------------------------------------- +// AlpCompression implementation + +template +std::optional AlpCompression::EstimateCompressedSize( + const std::vector& input_vector, + const AlpExponentAndFactor exponent_and_factor, + const bool penalize_exceptions) { + // Dry compress a vector (ideally a sample) to estimate ALP compression size + // given an exponent and factor. + SignedExactType max_encoded_value = std::numeric_limits::min(); + SignedExactType min_encoded_value = std::numeric_limits::max(); + + uint64_t num_exceptions = 0; + uint64_t num_non_exceptions = 0; + for (const T& value : input_vector) { + const SignedExactType encoded_value = + AlpInlines::EncodeValue(value, exponent_and_factor); + T decoded_value = AlpInlines::DecodeValue(encoded_value, exponent_and_factor); + if (decoded_value == value) { + num_non_exceptions++; + max_encoded_value = std::max(encoded_value, max_encoded_value); + min_encoded_value = std::min(encoded_value, min_encoded_value); + continue; + } + num_exceptions++; + } + + // We penalize combinations which yield almost all exceptions. 
+ if (penalize_exceptions && num_non_exceptions < 2) { + return std::nullopt; + } + + // Evaluate factor/exponent compression size (we optimize for FOR). + const ExactType delta = (static_cast(max_encoded_value) - + static_cast(min_encoded_value)); + + // Add the 1 in floating point: in ExactType arithmetic delta + 1 wraps to 0 when + // delta is the maximum value, and log2(0) is -inf, which cannot be converted to + // uint32_t. For every other delta the result is unchanged. + const uint32_t estimated_bits_per_value = + static_cast(std::ceil(std::log2(delta + 1.0))); + // Estimate is in bits: packed values plus, per exception, one full-width value + // and one position. + uint64_t estimated_compression_size = input_vector.size() * estimated_bits_per_value; + estimated_compression_size += + num_exceptions * (kExactTypeBitSize + (sizeof(PositionType) * 8)); + return estimated_compression_size; +} + +template +AlpEncodingPreset AlpCompression::CreateEncodingPreset( + const std::vector>& vectors_sampled) { + // Find the best combinations of factor-exponent from each sampled vector. + // This function is called once per segment. + // This operates over ALP first level samples. + static constexpr uint64_t kMaxCombinationCount = + (Constants::kMaxExponent + 1) * (Constants::kMaxExponent + 2) / 2; + + std::map best_k_combinations_hash; + + uint64_t best_compressed_size_bits = std::numeric_limits::max(); + // For each vector sampled. + for (const std::vector& sampled_vector : vectors_sampled) { + const uint64_t num_samples = sampled_vector.size(); + const AlpExponentAndFactor best_encoding_options{Constants::kMaxExponent, + Constants::kMaxExponent}; + + // Start optimization with worst possible total bits from compression. + const uint64_t best_total_bits = + (num_samples * (kExactTypeBitSize + sizeof(PositionType) * 8)) + + (num_samples * kExactTypeBitSize); + + // N of appearances is irrelevant at this phase; we search for best compression. + AlpCombination best_combination{best_encoding_options, 0, best_total_bits}; + // Try all combinations to find the one which minimizes compression size. 
+ for (uint8_t exp_idx = 0; exp_idx <= Constants::kMaxExponent; exp_idx++) { + for (uint8_t factor_idx = 0; factor_idx <= exp_idx; factor_idx++) { + const AlpExponentAndFactor current_exponent_and_factor{exp_idx, factor_idx}; + std::optional estimated_compression_size = EstimateCompressedSize( + sampled_vector, current_exponent_and_factor, /*penalize_exceptions=*/true); + + // Skip comparison for values that are not compressible. + if (!estimated_compression_size.has_value()) { + continue; + } + + const AlpCombination current_combination{current_exponent_and_factor, 0, + *estimated_compression_size}; + if (CompareAlpCombinations(current_combination, best_combination)) { + best_combination = current_combination; + best_compressed_size_bits = + std::min(best_compressed_size_bits, *estimated_compression_size); + } + } + } + best_k_combinations_hash[best_combination.exponent_and_factor]++; + } + + // Convert our hash to a Combination vector to be able to sort. + // Note that this vector should mostly be small (< 10 combinations). + std::vector best_k_combinations; + best_k_combinations.reserve( + std::min(best_k_combinations_hash.size(), kMaxCombinationCount)); + for (const auto& combination : best_k_combinations_hash) { + best_k_combinations.emplace_back(AlpCombination{ + combination.first, // Encoding Indices + combination.second, // N of times it appeared (hash value) + 0 // Compression size is irrelevant since we compare different vectors. + }); + } + std::sort(best_k_combinations.begin(), best_k_combinations.end(), + CompareAlpCombinations); + + std::vector combinations; + // Save k' best combinations. 
+ for (uint64_t i = 0; + i < std::min(kMaxCombinations, static_cast(best_k_combinations.size())); + i++) { + combinations.push_back(best_k_combinations[i].exponent_and_factor); + } + + const uint64_t best_compressed_size_bytes = + std::ceil(best_compressed_size_bits / 8.0); + return {combinations, best_compressed_size_bytes}; +} + +template +std::vector AlpCompression::CreateSample(arrow::util::span input) { + // Sample equidistant values within a vector; skip a fixed number of values. + const auto idx_increments = std::max( + 1, static_cast(std::ceil(static_cast(input.size()) / + AlpConstants::kSamplerSamplesPerVector))); + std::vector vector_sample; + vector_sample.reserve(std::ceil(input.size() / static_cast(idx_increments))); + for (uint64_t i = 0; i < input.size(); i += idx_increments) { + vector_sample.push_back(input[i]); + } + return vector_sample; +} + +template +AlpExponentAndFactor AlpCompression::FindBestExponentAndFactor( + arrow::util::span input, + const std::vector& combinations) { + // Find the best factor-exponent combination from within the best k combinations. + // This is ALP second level sampling. + if (combinations.size() == 1) { + return combinations.front(); + } + + const std::vector sample_vector = CreateSample(input); + + AlpExponentAndFactor best_exponent_and_factor; + uint64_t best_total_bits = std::numeric_limits::max(); + uint64_t worse_total_bits_counter = 0; + + // Try each K combination to find the one which minimizes compression size. + for (const AlpExponentAndFactor& exponent_and_factor : combinations) { + std::optional estimated_compression_size = EstimateCompressedSize( + sample_vector, exponent_and_factor, /*penalize_exceptions=*/false); + + // Skip exponents and factors which result in many exceptions. + if (!estimated_compression_size.has_value()) { + continue; + } + + // If current compression size is worse or equal than current best combination. 
+ if (estimated_compression_size >= best_total_bits) { + worse_total_bits_counter += 1; + // Early exit strategy. + if (worse_total_bits_counter == kSamplingEarlyExitThreshold) { + break; + } + continue; + } + // Otherwise replace the best and continue trying with next combination. + best_total_bits = estimated_compression_size.value(); + best_exponent_and_factor = exponent_and_factor; + worse_total_bits_counter = 0; + } + return best_exponent_and_factor; +} + +/// \brief Encode one vector with the given exponent/factor and apply frame of reference +/// +/// Every input is encoded; inputs whose decode does not reproduce the original value +/// are recorded as exceptions (position + original value). Their encoded slot is +/// overwritten with a placeholder copied from a non-exception slot so that they do +/// not widen the min/max range used for the frame of reference. +template +auto AlpCompression::EncodeVector(arrow::util::span input_vector, + AlpExponentAndFactor exponent_and_factor) + -> EncodingResult { + arrow::internal::StaticVector encoded_integers; + arrow::internal::StaticVector exceptions; + arrow::internal::StaticVector exception_positions; + + // Encoding Float/Double to SignedExactType(Int32, Int64). + // Encode all values regardless of correctness to recover original floating-point. + uint64_t input_offset = 0; + for (const T input : input_vector) { + const SignedExactType encoded_value = + AlpInlines::EncodeValue(input, exponent_and_factor); + const T decoded_value = AlpInlines::DecodeValue(encoded_value, exponent_and_factor); + encoded_integers.push_back(encoded_value); + // Detect exceptions using a predicated comparison. + if (decoded_value != input) { + exception_positions.push_back(input_offset); + } + input_offset++; + } + + // Finding first non-exception value. + // exception_positions is ascending, so the scan stops at the first index that is + // not itself an exception position. When the exceptions form a prefix (0..k-1) + // the scan runs to the end with exception_offset == k, which is the first + // non-exception as long as it is still inside the vector; without the check below + // the placeholder would stay 0 and could inflate the FOR range. + SignedExactType first_non_exception_value = 0; + PositionType exception_offset = 0; + for (const PositionType exception_position : exception_positions) { + if (exception_offset != exception_position) { + break; + } + exception_offset++; + } + // Only false when every value is an exception; the placeholder then stays 0. + if (exception_offset < encoded_integers.size()) { + first_non_exception_value = encoded_integers[exception_offset]; + } + + // Use first non-exception value as placeholder for all exception values. 
+ for (const PositionType exception_position : exception_positions) { + const T actual_value = input_vector[exception_position]; + encoded_integers[exception_position] = first_non_exception_value; + exceptions.push_back(actual_value); + } + + // Analyze FOR. + const auto [min, max] = + std::minmax_element(encoded_integers.begin(), encoded_integers.end()); + const auto frame_of_reference = static_cast(*min); + + for (SignedExactType& encoded_integer : encoded_integers) { + ExactType& u_encoded_integer = *reinterpret_cast(&encoded_integer); + u_encoded_integer -= frame_of_reference; + } + + const ExactType min_max_diff = + (static_cast(*max) - static_cast(*min)); + return EncodingResult{encoded_integers, exceptions, exception_positions, min_max_diff, + frame_of_reference}; +} + +template +auto AlpCompression::BitPackIntegers( + arrow::util::span integers, const uint64_t min_max_diff) + -> BitPackingResult { + uint8_t bit_width = 0; + + if (min_max_diff == 0) { + bit_width = 0; + } else if constexpr (std::is_same_v) { + bit_width = sizeof(T) * 8 - __builtin_clz(min_max_diff); + } else if constexpr (std::is_same_v) { + bit_width = sizeof(T) * 8 - __builtin_clzll(min_max_diff); + } + const uint64_t bit_packed_size = std::ceil((bit_width * integers.size()) / 8.0); + + arrow::internal::StaticVector packed_integers; + // Use unsafe resize to avoid zero-initialization. Zero initialization was + // resulting in around 2-3% degradation in compression speed. + packed_integers.UnsafeResize(bit_packed_size); + if (bit_width > 0) { // Only execute BP if writing data. + // Use Arrow's BitWriter for packing (loop-based). 
+ arrow::bit_util::BitWriter writer(packed_integers.data(), + static_cast(bit_packed_size)); + for (uint64_t i = 0; i < integers.size(); ++i) { + writer.PutValue(static_cast(integers[i]), bit_width); + } + writer.Flush(false); + } + return {packed_integers, bit_width, bit_packed_size}; +} + +template +AlpEncodedVector AlpCompression::CompressVector(const T* input_vector, + uint16_t num_elements, + const AlpEncodingPreset& preset) { + // Compress by finding a fitting exponent/factor, encode input, and bitpack. + const arrow::util::span input_span{input_vector, num_elements}; + const AlpExponentAndFactor exponent_and_factor = + FindBestExponentAndFactor(input_span, preset.combinations); + const EncodingResult encoding_result = EncodeVector(input_span, exponent_and_factor); + BitPackingResult bitpacking_result; + switch (preset.bit_pack_layout) { + case AlpBitPackLayout::kNormal: + bitpacking_result = + BitPackIntegers(encoding_result.encoded_integers, encoding_result.min_max_diff); + break; + default: + ARROW_CHECK(false) << "invalid_bit_pack_layout: " + << static_cast(preset.bit_pack_layout); + break; + } + + // Transfer compressed data into a serializable format. + const AlpEncodedVectorInfo vector_info{ + exponent_and_factor, + encoding_result.frame_of_reference, + bitpacking_result.bit_width, + bitpacking_result.bit_packed_size, + num_elements, + static_cast(encoding_result.exceptions.size())}; + + return AlpEncodedVector{vector_info, bitpacking_result.packed_integers, + encoding_result.exceptions, + encoding_result.exception_positions}; +} + +template +auto AlpCompression::BitUnpackIntegers( + arrow::util::span packed_integers, + const AlpEncodedVectorInfo vector_info) + -> arrow::internal::StaticVector { + arrow::internal::StaticVector encoded_integers; + // Optimization: Use UnsafeResize to avoid zero-initialization. + // Safe because we immediately write to all elements via unpack(). 
+ encoded_integers.UnsafeResize(vector_info.num_elements); + + if (vector_info.bit_width > 0) { + // Arrow's SIMD unpack works in fixed batch sizes. All SIMD implementations + // (SIMD128/NEON, SIMD256/AVX2, SIMD512/AVX512) have identical batch sizes: + // - uint32_t (float): Simd*UnpackerForWidth::kValuesUnpacked = 32 + // - uint64_t (double): Simd*UnpackerForWidth::kValuesUnpacked = 64 + // These constants are in anonymous namespaces (internal implementation detail), + // so we hardcode them here. + constexpr int kMinBatchSize = std::is_same_v ? 32 : 64; + const int num_elements = static_cast(vector_info.num_elements); + const int num_complete_batches = num_elements / kMinBatchSize; + const int num_complete_elements = num_complete_batches * kMinBatchSize; + + // Use Arrow's SIMD-optimized unpack for complete batches. + if (num_complete_elements > 0) { + arrow::internal::unpack(packed_integers.data(), encoded_integers.data(), + num_complete_elements, vector_info.bit_width); + } + + // Handle remaining elements (<64) with BitReader to match BitWriter format. 
+ const int remaining = num_elements - num_complete_elements; + if (remaining > 0) { + // Calculate byte offset where SIMD unpack finished + const uint64_t bits_consumed_by_simd = + static_cast(num_complete_elements) * vector_info.bit_width; + // Round up to next byte + const uint64_t bytes_consumed_by_simd = (bits_consumed_by_simd + 7) / 8; + + // Use BitReader for remaining elements starting from where SIMD left off + arrow::bit_util::BitReader reader( + packed_integers.data() + bytes_consumed_by_simd, + static_cast(packed_integers.size() - bytes_consumed_by_simd)); + + for (int i = 0; i < remaining; ++i) { + uint64_t value = 0; + if (reader.GetValue(vector_info.bit_width, &value)) { + encoded_integers[num_complete_elements + i] = static_cast(value); + } else { + encoded_integers[num_complete_elements + i] = 0; + } + } + } + } else { + std::memset(encoded_integers.data(), 0, vector_info.num_elements * sizeof(ExactType)); + } + return encoded_integers; +} + +template +template +void AlpCompression::DecodeVector(TargetType* output_vector, + arrow::util::span input_vector, + const AlpEncodedVectorInfo vector_info) { + // Fused unFOR + decode loop - reduces memory traffic by avoiding + // intermediate write-then-read of the unFOR'd values. + const size_t num_elements = input_vector.size(); + const ExactType* data = input_vector.data(); + const ExactType frame_of_ref = vector_info.frame_of_reference; + +#pragma GCC unroll AlpConstants::kLoopUnrolls +#pragma GCC ivdep + for (size_t i = 0; i < num_elements; ++i) { + // 1. Apply frame of reference (unFOR) - unsigned arithmetic + const ExactType unfored_value = data[i] + frame_of_ref; + // 2. Reinterpret as signed integer for decode + SignedExactType signed_value; + std::memcpy(&signed_value, &unfored_value, sizeof(SignedExactType)); + // 3. 
Decode using original function to preserve exact floating-point behavior + output_vector[i] = + AlpInlines::DecodeValue(signed_value, vector_info.exponent_and_factor); + } +} + +template +template +void AlpCompression::PatchExceptions( + TargetType* output, arrow::util::span exceptions, + arrow::util::span exception_positions) { + // Exceptions Patching. + uint64_t exception_idx = 0; +#pragma GCC unroll AlpConstants::kLoopUnrolls +#pragma GCC ivdep + for (uint16_t const exception_position : exception_positions) { + output[exception_position] = static_cast(exceptions[exception_idx]); + exception_idx++; + } +} + +template +template +void AlpCompression::DecompressVector(const AlpEncodedVector& packed_vector, + const AlpBitPackLayout bit_pack_layout, + TargetType* output) { + static_assert(sizeof(T) <= sizeof(TargetType)); + const AlpEncodedVectorInfo& vector_info = packed_vector.vector_info; + + switch (bit_pack_layout) { + case AlpBitPackLayout::kNormal: { + arrow::internal::StaticVector encoded_integers = + BitUnpackIntegers(packed_vector.packed_values, vector_info); + DecodeVector(output, {encoded_integers.data(), vector_info.num_elements}, + vector_info); + PatchExceptions(output, packed_vector.exceptions, + packed_vector.exception_positions); + } break; + default: + ARROW_CHECK(false) << "invalid_bit_pack_layout: " + << static_cast(bit_pack_layout); + break; + } +} + +template +template +void AlpCompression::DecompressVectorView(const AlpEncodedVectorView& encoded_view, + const AlpBitPackLayout bit_pack_layout, + TargetType* output) { + static_assert(sizeof(T) <= sizeof(TargetType)); + const AlpEncodedVectorInfo& vector_info = encoded_view.vector_info; + + switch (bit_pack_layout) { + case AlpBitPackLayout::kNormal: { + // Use the view's spans directly - no copy needed + arrow::internal::StaticVector encoded_integers = + BitUnpackIntegers(encoded_view.packed_values, vector_info); + DecodeVector(output, {encoded_integers.data(), vector_info.num_elements}, + 
vector_info); + PatchExceptions(output, encoded_view.exceptions, + encoded_view.exception_positions); + } break; + default: + ARROW_CHECK(false) << "invalid_bit_pack_layout: " + << static_cast(bit_pack_layout); + break; + } +} + +// ---------------------------------------------------------------------- +// Template instantiations + +template void AlpCompression::DecompressVector( + const AlpEncodedVector& packed_vector, AlpBitPackLayout bit_pack_layout, + double* output); +template void AlpCompression::DecompressVector( + const AlpEncodedVector& packed_vector, AlpBitPackLayout bit_pack_layout, + float* output); +template void AlpCompression::DecompressVector( + const AlpEncodedVector& packed_vector, AlpBitPackLayout bit_pack_layout, + double* output); + +template void AlpCompression::DecompressVectorView( + const AlpEncodedVectorView& encoded_view, AlpBitPackLayout bit_pack_layout, + double* output); +template void AlpCompression::DecompressVectorView( + const AlpEncodedVectorView& encoded_view, AlpBitPackLayout bit_pack_layout, + float* output); +template void AlpCompression::DecompressVectorView( + const AlpEncodedVectorView& encoded_view, AlpBitPackLayout bit_pack_layout, + double* output); + +template class AlpCompression; +template class AlpCompression; + +} // namespace alp +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/util/alp/Alp.h b/cpp/src/arrow/util/alp/Alp.h new file mode 100644 index 00000000000..e73ba5b6bac --- /dev/null +++ b/cpp/src/arrow/util/alp/Alp.h @@ -0,0 +1,529 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Adaptive Lossless floating-Point (ALP) compression implementation + +#pragma once + +#include + +#include "arrow/util/alp/AlpConstants.h" +#include "arrow/util/small_vector.h" +#include "arrow/util/span.h" + +namespace arrow { +namespace util { +namespace alp { + +// ---------------------------------------------------------------------- +// ALP Overview +// +// IMPORTANT: For abstract interfaces or examples how to use ALP, consult +// AlpWrapper.h. +// This is our implementation of the adaptive lossless floating-point +// compression for decimals (ALP) (https://dl.acm.org/doi/10.1145/3626717). +// It works by converting a float into a decimal (if possible). The exponent +// and factor are chosen per vector. Each float is converted using +// c(f) = int64(f * 10^exponent * 10^-factor). The converted floats are then +// encoded via a delta frame of reference and bitpacked. Every exception, +// where the conversion/reconversion changes the value of the float, is stored +// separately and has to be patched into the decompressed vector afterwards. +// +// ========================================================================== +// ALP COMPRESSION/DECOMPRESSION PIPELINE +// ========================================================================== +// +// COMPRESSION FLOW: +// ----------------- +// +// Input: float/double array +// | +// v +// +------------------------------------------------------------------+ +// | 1. 
SAMPLING & PRESET GENERATION | +// | * Sample vectors from dataset | +// | * Try all exponent/factor combinations (e, f) | +// | * Select best k combinations for preset | +// +------------------------------------+-----------------------------+ +// | preset.combinations +// v +// +------------------------------------------------------------------+ +// | 2. PER-VECTOR COMPRESSION | +// | a) Find best (e,f) from preset for this vector | +// | b) Encode: encoded[i] = int64(value[i] * 10^e * 10^-f) | +// | c) Verify: if decode(encoded[i]) != value[i] -> exception | +// | d) Replace exceptions with placeholder value | +// +------------------------------------+-----------------------------+ +// | encoded integers + exceptions +// v +// +------------------------------------------------------------------+ +// | 3. FRAME OF REFERENCE (FOR) | +// | * Find min value in encoded integers | +// | * Subtract min from all values: delta[i] = encoded[i] - min | +// +------------------------------------+-----------------------------+ +// | delta values (smaller range) +// v +// +------------------------------------------------------------------+ +// | 4. BIT PACKING | +// | * Calculate bit_width = ceil(log2(max_delta + 1)), i.e. the | +// | minimum number of bits needed to represent max_delta | +// | * Pack each value into bit_width bits | +// | * Result: tightly packed binary data | +// +------------------------------------+-----------------------------+ +// | packed bytes +// v +// +------------------------------------------------------------------+ +// | 5. SERIALIZATION (see AlpEncodedVector diagram below) | +// | [VectorInfo][PackedData][ExceptionPos][ExceptionValues] | +// +------------------------------------------------------------------+ +// +// +// DECOMPRESSION FLOW: +// ------------------- +// +// Serialized bytes -> AlpEncodedVector::Load() +// | +// v +// +------------------------------------------------------------------+ +// | 1.
BIT UNPACKING | +// | * Extract bit_width from metadata | +// | * Unpack each value from bit_width bits -> delta values | +// +------------------------------------+-----------------------------+ +// | delta values +// v +// +------------------------------------------------------------------+ +// | 2. REVERSE FRAME OF REFERENCE (unFOR) | +// | * Add back min: encoded[i] = delta[i] + frame_of_reference | +// +------------------------------------+-----------------------------+ +// | encoded integers +// v +// +------------------------------------------------------------------+ +// | 3. DECODE | +// | * Apply inverse formula: value[i] = encoded[i] * 10^-e * 10^f | +// +------------------------------------+-----------------------------+ +// | decoded floats (with placeholders) +// v +// +------------------------------------------------------------------+ +// | 4. PATCH EXCEPTIONS | +// | * Replace values at exception_positions[] with exceptions[] | +// +------------------------------------+-----------------------------+ +// | +// v +// Output: Original float/double array (lossless!) +// +// ========================================================================== + +// ---------------------------------------------------------------------- +// AlpMode + +/// \brief ALP compression mode +/// +/// Currently only ALP (decimal compression) is implemented. 
+enum class AlpMode { kAlp }; + +// ---------------------------------------------------------------------- +// AlpExponentAndFactor + +/// \brief Helper struct to encapsulate the exponent and factor +struct AlpExponentAndFactor { + uint8_t exponent{0}; + uint8_t factor{0}; + + bool operator==(const AlpExponentAndFactor& other) const { + return exponent == other.exponent && factor == other.factor; + } + + /// \brief Comparison operator for deterministic std::map ordering + bool operator<(const AlpExponentAndFactor& other) const { + if (exponent != other.exponent) return exponent < other.exponent; + return factor < other.factor; + } +}; + +// ---------------------------------------------------------------------- +// AlpEncodedVectorInfo + +/// \brief Metadata for an encoded vector +/// +/// Helper struct to encapsulate all metadata of an encoded vector to be able +/// to load and decompress it. +/// +/// Serialization format (stored as raw binary struct): +/// +/// +------------------------------------------+ +/// | AlpEncodedVectorInfo (36 bytes + tail pad) | +/// +------------------------------------------+ +/// | Offset | Field | Size | +/// +---------+---------------------+----------+ +/// | 0 | exponent (uint8_t) | 1 byte | +/// | 1 | factor (uint8_t) | 1 byte | +/// | 2 | [padding] | 6 bytes | +/// | 8 | frame_of_reference | 8 bytes | +/// | 16 | bit_width (uint8_t)| 1 byte | +/// | 17 | [padding] | 7 bytes | +/// | 24 | bit_packed_size | 8 bytes | +/// | 32 | num_elements | 2 bytes | +/// | 34 | num_exceptions | 2 bytes | +/// +------------------------------------------+ +/// +/// The stored size is sizeof(AlpEncodedVectorInfo), so any trailing padding +/// after num_exceptions is included (typically 40 bytes in total when +/// uint64_t is 8-byte aligned, as the offsets above assume). +struct AlpEncodedVectorInfo { + /// Exponent and factor used for compression + AlpExponentAndFactor exponent_and_factor; + /// Delta used for frame of reference encoding + uint64_t frame_of_reference = 0; + /// Bitwidth used for bitpacking + uint8_t bit_width = 0; + /// Overall bitpacked size of non-exception values + uint64_t bit_packed_size = 0; + /// Number of elements encoded in this vector +
uint16_t num_elements = 0; + /// Number of exceptions stored in this vector + uint16_t num_exceptions = 0; + + /// \brief Store this metadata struct as raw bytes into an output buffer + /// + /// \param[out] output_buffer the buffer to write the metadata into; + /// must hold at least GetStoredSize() bytes + void Store(arrow::util::span output_buffer) const; + + /// \brief Load the vector metadata from its raw-byte serialized form + /// + /// \param[in] input_buffer the buffer to load from; + /// must hold at least GetStoredSize() bytes + /// \return the loaded AlpEncodedVectorInfo + static AlpEncodedVectorInfo Load(arrow::util::span input_buffer); + + /// \brief Get serialized size of the encoded vector info + /// + /// \return the size in bytes, i.e. sizeof(AlpEncodedVectorInfo) + static uint64_t GetStoredSize(); + + bool operator==(const AlpEncodedVectorInfo& other) const; +}; + +// ---------------------------------------------------------------------- +// AlpEncodedVector + +/// \class AlpEncodedVector +/// \brief A compressed ALP vector with metadata +/// +/// Complete serialization format for an ALP compressed vector: +/// +/// +------------------------------------------------------------+ +/// | AlpEncodedVector Serialized Layout | +/// +------------------------------------------------------------+ +/// | Section | Size (bytes) | Description | +/// +-----------------------+----------------------+-------------+ +/// | 1. VectorInfo | sizeof(VectorInfo) | Metadata | +/// | (see above) | (36 + tail padding) | | +/// +-----------------------+----------------------+-------------+ +/// | 2. Packed Values | bit_packed_size | Bitpacked | +/// | (compressed data) | (variable) | integers | +/// +-----------------------+----------------------+-------------+ +/// | 3. Exception Pos | num_exceptions * 2 | uint16_t[] | +/// | (indices) | (variable) | positions | +/// +-----------------------+----------------------+-------------+ +/// | 4.
Exception Values | num_exceptions * | T[] (float/| +/// | (original floats) | sizeof(T) | double) | +/// +------------------------------------------------------------+ +/// +/// Example for 1024 floats with 5 exceptions and bit_width=8: +/// - VectorInfo: sizeof(AlpEncodedVectorInfo) bytes (typically 40 on +/// 64-bit ABIs: 36 bytes of fields plus trailing padding) +/// - Packed Values: 1024 bytes (1024 * 8 bits / 8) +/// - Exception Pos: 10 bytes (5 * 2) +/// - Exception Values: 20 bytes (5 * 4) +/// Total: 1094 bytes (assuming a 40-byte VectorInfo) +template +class AlpEncodedVector { + public: + /// Metadata of the encoded vector + AlpEncodedVectorInfo vector_info; + /// Successfully encoded and bitpacked data + arrow::internal::StaticVector + packed_values; + /// Float values that could not be converted successfully + arrow::internal::StaticVector exceptions; + /// Positions of the exceptions in the decompressed vector + arrow::internal::StaticVector exception_positions; + + /// \brief Get the size of the vector if stored into a sequential memory block + /// + /// \return the stored size in bytes + uint64_t GetStoredSize() const; + + /// \brief Get the stored size for a given vector info + /// + /// \param[in] info the vector info to calculate size for + /// \return the stored size in bytes + static uint64_t GetStoredSize(const AlpEncodedVectorInfo& info); + + /// \brief Get the number of elements in this vector + /// + /// \return number of elements + uint64_t GetNumElements() const { return vector_info.num_elements; } + + /// \brief Store the compressed vector in a compact format into an output buffer + /// + /// \param[out] output_buffer the buffer to store the compressed data into + void Store(arrow::util::span output_buffer) const; + + /// \brief Load a compressed vector from a compact format from an input buffer + /// + /// \param[in] input_buffer the buffer to load from + /// \return the loaded AlpEncodedVector + static AlpEncodedVector Load(arrow::util::span input_buffer); + + bool operator==(const AlpEncodedVector& other) const; +}; + +//
---------------------------------------------------------------------- +// AlpEncodedVectorView + +/// \class AlpEncodedVectorView +/// \brief A zero-copy view into compressed ALP data +/// +/// Unlike AlpEncodedVector which copies data into internal buffers, +/// AlpEncodedVectorView holds spans that point directly to the compressed +/// data buffer. This avoids memory copies during decompression. +/// +/// Use LoadView() to create a view, then pass to DecompressVectorView(). +/// The underlying buffer must remain valid for the lifetime of the view. +template +struct AlpEncodedVectorView { + /// Metadata of the encoded vector (copied, small fixed size) + AlpEncodedVectorInfo vector_info; + /// View into bitpacked data (no copy) + arrow::util::span packed_values; + /// View into exception values (no copy) + arrow::util::span exceptions; + /// View into exception positions (no copy) + arrow::util::span exception_positions; + + /// \brief Create a zero-copy view from a compact format input buffer + /// + /// \param[in] input_buffer the buffer to create a view into + /// \return the view into the compressed data + static AlpEncodedVectorView LoadView(arrow::util::span input_buffer); + + /// \brief Get the stored size of this vector in the buffer + /// + /// \return the stored size in bytes + uint64_t GetStoredSize() const; +}; + +// ---------------------------------------------------------------------- +// AlpBitPackLayout + +/// \brief Bit packing layout +/// +/// Currently only normal bit packing is implemented. +enum class AlpBitPackLayout { kNormal }; + +// ---------------------------------------------------------------------- +// AlpEncodingPreset + +/// \brief Preset for ALP compression +/// +/// Helper struct for compression. Before a larger amount of data is compressed, +/// a preset is generated, which contains multiple combinations of exponents and +/// factors. 
For each vector that is compressed, one of the combinations of this +/// preset is chosen dynamically. +struct AlpEncodingPreset { + /// Combinations of exponents and factors + std::vector combinations; + /// Best compressed size for the preset + uint64_t best_compressed_size = 0; + /// Bit packing layout used for bitpacking + AlpBitPackLayout bit_pack_layout = AlpBitPackLayout::kNormal; +}; + +template +class AlpSampler; + +// ---------------------------------------------------------------------- +// AlpCompression + +/// \class AlpCompression +/// \brief ALP compression and decompression facilities +/// +/// AlpCompression contains all facilities to compress and decompress data with +/// ALP in a vectorized fashion. Use CreateEncodingPreset() first on a sample of +/// the input data, then compress it vector-wise via CompressVector(). To +/// serialize the data, use the facilities provided by AlpEncodedVector. +/// +/// \tparam T the type of data to be compressed. Currently float and double. 
+template +class AlpCompression : private AlpConstants { + public: + using Constants = AlpTypedConstants; + using ExactType = typename Constants::FloatingToExact; + using SignedExactType = typename Constants::FloatingToSignedExact; + static constexpr uint8_t kExactTypeBitSize = sizeof(T) * 8; + + /// \brief Compress a vector of floating point values via ALP + /// + /// \param[in] input_vector a vector of floats containing input to compress + /// \param[in] num_elements the number of values to be compressed + /// \param[in] preset the preset to be used for compression + /// \return an ALP encoded vector + static AlpEncodedVector CompressVector(const T* input_vector, + uint16_t num_elements, + const AlpEncodingPreset& preset); + + /// \brief Decompress a compressed vector with ALP + /// + /// \param[in] encoded_vector the ALP encoded vector to be decompressed + /// \param[in] bit_pack_layout the bit packing layout used + /// \param[out] output_vector the vector of floats to decompress into. + /// Must be able to contain encoded_vector.GetNumElements(). + /// \tparam TargetType the type that is used to store the output. + /// May not be a narrowing conversion from T. + template + static void DecompressVector(const AlpEncodedVector& encoded_vector, + AlpBitPackLayout bit_pack_layout, + TargetType* output_vector); + + /// \brief Decompress using a zero-copy view (faster, no memory allocation) + /// + /// \param[in] encoded_view the zero-copy view into compressed data + /// \param[in] bit_pack_layout the bit packing layout used + /// \param[out] output_vector the vector of floats to decompress into. + /// Must be able to contain encoded_view.vector_info.num_elements. + /// \tparam TargetType the type that is used to store the output. + /// May not be a narrowing conversion from T. 
+ template + static void DecompressVectorView(const AlpEncodedVectorView& encoded_view, + AlpBitPackLayout bit_pack_layout, + TargetType* output_vector); + + protected: + /// \brief Creates an EncodingPreset consisting of multiple factors/exponents + /// + /// \param[in] vectors_sampled the sampled vectors to derive combinations from + /// \return the EncodingPreset + static AlpEncodingPreset CreateEncodingPreset( + const std::vector>& vectors_sampled); + friend AlpSampler; + + private: + /// \brief Create a subsample of floats from an input vector for preset gen + /// + /// \param[in] input the input vector to sample from + /// \return a vector containing a representative subsample of input values + static std::vector CreateSample(arrow::util::span input); + + /// \brief Perform a dry-compression to estimate the compressed size + /// + /// \param[in] input_vector the input vector to estimate compression for + /// \param[in] exponent_and_factor the exponent/factor combination to evaluate + /// \param[in] penalize_exceptions if true, applies a penalty for exceptions + /// \return the estimated compressed size in bytes, or std::nullopt if the + /// data is not compressible using these settings + static std::optional EstimateCompressedSize( + const std::vector& input_vector, + AlpExponentAndFactor exponent_and_factor, + bool penalize_exceptions); + + /// \brief Find the best exponent and factor combination for an input vector + /// + /// Iterates through all combinations in the preset and selects the one + /// that produces the smallest compressed size. 
+ /// + /// \param[in] input the input vector to find the best combination for + /// \param[in] combinations candidate exponent/factor combinations from preset + /// \return the exponent and factor combination yielding best compression + static AlpExponentAndFactor FindBestExponentAndFactor( + arrow::util::span input, + const std::vector& combinations); + + /// \brief Helper struct to encapsulate the result from EncodeVector() + struct EncodingResult { + arrow::internal::StaticVector + encoded_integers; + arrow::internal::StaticVector exceptions; + arrow::internal::StaticVector + exception_positions; + ExactType min_max_diff = 0; + ExactType frame_of_reference = 0; + }; + + /// \brief Encode a vector via decimal encoding and frame of reference (FOR) + /// + /// \param[in] input_vector the input vector of floating point values + /// \param[in] exponent_and_factor the exponent/factor for decimal encoding + /// \return an EncodingResult containing encoded integers, exceptions, etc. + static EncodingResult EncodeVector(arrow::util::span input_vector, + AlpExponentAndFactor exponent_and_factor); + + /// \brief Decode a vector of integers back to floating point values + /// + /// \param[out] output_vector output buffer to write decoded floats to + /// \param[in] input_vector encoded integers (after bit unpacking and unFOR) + /// \param[in] vector_info metadata with exponent, factor, decoding params + /// \tparam TargetType the type that is used to store the output. + /// May not be a narrowing conversion from T. 
+ template + static void DecodeVector(TargetType* output_vector, + arrow::util::span input_vector, + AlpEncodedVectorInfo vector_info); + + /// \brief Helper struct to encapsulate the result from BitPackIntegers + struct BitPackingResult { + arrow::internal::StaticVector + packed_integers; + uint8_t bit_width = 0; + uint64_t bit_packed_size = 0; + }; + + /// \brief Bitpack the encoded integers as the final step of compression + /// + /// Calculates the minimum bit width required and packs each value + /// using that many bits, resulting in tightly packed binary data. + /// + /// \param[in] integers the encoded integers (after FOR subtraction) + /// \param[in] min_max_diff the difference between max and min values, + /// used to determine the required bit width + /// \return a BitPackingResult with packed bytes, bit width, and packed size + static BitPackingResult BitPackIntegers( + arrow::util::span integers, uint64_t min_max_diff); + + /// \brief Unpack bitpacked integers back to their original representation + /// + /// The result is still encoded (FOR applied) and needs decoding to get floats. + /// + /// \param[in] packed_integers the bitpacked integer data to unpack + /// \param[in] vector_info metadata with bit width and unpacking parameters + /// \return a vector of unpacked integers (still with frame of reference) + static arrow::internal::StaticVector BitUnpackIntegers( + arrow::util::span packed_integers, + AlpEncodedVectorInfo vector_info); + + /// \brief Patch exceptions into the decoded output vector + /// + /// Replaces placeholder values at exception positions with the original + /// floating point values that could not be losslessly encoded. + /// + /// \param[out] output the decoded output vector to patch exceptions into + /// \param[in] exceptions the original floats stored as exceptions + /// \param[in] exception_positions indices where exceptions should be placed + /// \tparam TargetType the type that is used to store the output. 
+ /// May not be a narrowing conversion from T. + template + static void PatchExceptions(TargetType* output, + arrow::util::span exceptions, + arrow::util::span exception_positions); +}; + +} // namespace alp +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/util/alp/AlpConstants.h b/cpp/src/arrow/util/alp/AlpConstants.h new file mode 100644 index 00000000000..5a44e8f7a95 --- /dev/null +++ b/cpp/src/arrow/util/alp/AlpConstants.h @@ -0,0 +1,256 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Constants and type traits for ALP compression + +#pragma once + +#include + +#include "arrow/util/logging.h" + +namespace arrow { +namespace util { +namespace alp { + +// ---------------------------------------------------------------------- +// AlpConstants + +/// \brief Constants used throughout ALP compression +class AlpConstants { + public: + /// Number of elements compressed together as a unit. Fixed for compatibility. + static constexpr uint64_t kAlpVectorSize = 1024; + + /// Number of elements to use when determining sampling parameters. + static constexpr uint64_t kSamplerVectorSize = 4096; + + /// Total number of elements in a rowgroup for sampling purposes. 
+ static constexpr uint64_t kSamplerRowgroupSize = 122880; + + /// Number of samples to collect per vector during the sampling phase. + static constexpr uint64_t kSamplerSamplesPerVector = 256; + + /// Number of sample vectors to collect per rowgroup. + static constexpr uint64_t kSamplerSampleVectorsPerRowgroup = 8; + + /// Version number for the ALP compression format. + static constexpr uint64_t kAlpVersion = 1; + + /// Type used to store exception positions within a compressed vector. + using PositionType = uint16_t; + + /// Threshold for early exit during sampling when compression quality is poor. + static constexpr uint8_t kSamplingEarlyExitThreshold = 4; + + /// Maximum number of exponent-factor combinations to try during compression. + static constexpr uint8_t kMaxCombinations = 5; + + /// Loop unroll factor for tight loops in ALP compression/decompression. + /// ALP has multiple tight loops that profit from unrolling. Setting this + /// might affect performance, so benchmarking is recommended. 
+ static constexpr uint64_t kLoopUnrolls = 4; + + /// \brief Get power of ten as uint64_t + /// + /// \param[in] power the exponent (must be <= 19) + /// \return 10^power as uint64_t + static uint64_t PowerOfTenUB8(const uint8_t power) { + ARROW_DCHECK(power <= 19) << "power_out_of_range: " << static_cast(power); + static constexpr uint64_t kTable[20] = {1, + 10, + 100, + 1'000, + 10'000, + 100'000, + 1'000'000, + 10'000'000, + 100'000'000, + 1'000'000'000, + 10'000'000'000, + 100'000'000'000, + 1'000'000'000'000, + 10'000'000'000'000, + 100'000'000'000'000, + 1'000'000'000'000'000, + 10'000'000'000'000'000, + 100'000'000'000'000'000, + 1'000'000'000'000'000'000, + 10'000'000'000'000'000'000ULL}; + + return kTable[power]; + } + + /// \brief Get power of ten as float + /// + /// \param[in] power the exponent (must be in range [-10, 10]) + /// \return 10^power as float + static float PowerOfTenFloat(int8_t power) { + ARROW_DCHECK(power >= -10 && power <= 10) + << "power_out_of_range: " << static_cast(power); + static constexpr float kTable[21] = { + 0.0000000001F, 0.000000001F, 0.00000001F, 0.0000001F, 0.000001F, + 0.00001F, 0.0001F, 0.001F, 0.01F, 0.1F, + 1.0F, 10.0F, 100.0F, 1000.0F, 10000.0F, + 100000.0F, 1000000.0F, 10000000.0F, 100000000.0F, + 1000000000.0F, 10000000000.0F}; + + return kTable[power + 10]; + } + + /// \brief Get power of ten as double + /// + /// \param[in] power the exponent (must be in range [-20, 20]) + /// \return 10^power as double + static double PowerOfTenDouble(const int8_t power) { + ARROW_DCHECK(power >= -20 && power <= 20) + << "power_out_of_range: " << static_cast(power); + static constexpr double kTable[41] = { + 0.00000000000000000001, + 0.0000000000000000001, + 0.000000000000000001, + 0.00000000000000001, + 0.0000000000000001, + 0.000000000000001, + 0.00000000000001, + 0.0000000000001, + 0.000000000001, + 0.00000000001, + 0.0000000001, + 0.000000001, + 0.00000001, + 0.0000001, + 0.000001, + 0.00001, + 0.0001, + 0.001, + 0.01, + 
0.1, + 1.0, + 10.0, + 100.0, + 1000.0, + 10000.0, + 100000.0, + 1000000.0, + 10000000.0, + 100000000.0, + 1000000000.0, + 10000000000.0, + 100000000000.0, + 1000000000000.0, + 10000000000000.0, + 100000000000000.0, + 1000000000000000.0, + 10000000000000000.0, + 100000000000000000.0, + 1000000000000000000.0, + 10000000000000000000.0, + 100000000000000000000.0, + }; + return kTable[power + 20]; + } + + /// \brief Get factor as int64_t + /// + /// \param[in] power the exponent + /// \return 10^power as int64_t + static int64_t GetFactor(const int8_t power) { return PowerOfTenUB8(power); } +}; + +// ---------------------------------------------------------------------- +// AlpTypedConstants + +/// \brief Type-specific constants for ALP compression +/// \tparam FloatingPointType the floating point type (float or double) +template +struct AlpTypedConstants {}; + +/// \brief Type-specific constants for float +template <> +struct AlpTypedConstants { + /// Magic number used for fast rounding of floats to nearest integer: + /// rounded(n) = static_cast(n + kMagicNumber - kMagicNumber). + static constexpr float kMagicNumber = 12582912.0f; // 2^22 + 2^23 + + static constexpr uint8_t kMaxExponent = 10; + + /// Largest float value that can be safely converted to int32. + static constexpr float kEncodingUpperLimit = 2147483520.0f; + static constexpr float kEncodingLowerLimit = -2147483520.0f; + + /// \brief Get exponent multiplier + /// + /// \param[in] power the exponent + /// \return 10^power as float + static float GetExponent(const uint8_t power) { + return AlpConstants::PowerOfTenFloat(power); + } + + /// \brief Get factor multiplier + /// + /// \param[in] power the factor + /// \return 10^(-power) as float + static float GetFactor(const uint8_t power) { + // This double cast is necessary since subtraction on int8_t does not + // necessarily yield an int8_t. 
+ return AlpConstants::PowerOfTenFloat( + static_cast(-static_cast(power))); + } + + using FloatingToExact = uint32_t; + using FloatingToSignedExact = int32_t; +}; + +/// \brief Type-specific constants for double +template <> +class AlpTypedConstants { + public: + /// Magic number used for fast rounding of doubles to nearest integer: + /// rounded(n) = static_cast(n + kMagicNumber - kMagicNumber). + static constexpr double kMagicNumber = 6755399441055744.0; // 2^51 + 2^52 + + static constexpr uint8_t kMaxExponent = 18; // 10^18 is the maximum int64 + + /// Largest double value that can be safely converted to int64. + static constexpr double kEncodingUpperLimit = 9223372036854774784.0; + static constexpr double kEncodingLowerLimit = -9223372036854774784.0; + + /// \brief Get exponent multiplier + /// + /// \param[in] power the exponent + /// \return 10^power as double + static double GetExponent(const uint8_t power) { + return AlpConstants::PowerOfTenDouble(power); + } + + /// \brief Get factor multiplier + /// + /// \param[in] power the factor + /// \return 10^(-power) as double + static double GetFactor(const uint8_t power) { + return AlpConstants::PowerOfTenDouble( + static_cast(-static_cast(power))); + } + + using FloatingToExact = uint64_t; + using FloatingToSignedExact = int64_t; +}; + +} // namespace alp +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/util/alp/AlpSampler.cc b/cpp/src/arrow/util/alp/AlpSampler.cc new file mode 100644 index 00000000000..6a2c7a90dcf --- /dev/null +++ b/cpp/src/arrow/util/alp/AlpSampler.cc @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/alp/AlpSampler.h"
+
+#include <algorithm>
+#include <cstdint>
+
+#include "arrow/util/alp/Alp.h"
+#include "arrow/util/alp/AlpConstants.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+namespace alp {
+
+namespace {
+
+/// \brief Integer ceiling division. `denominator` must be non-zero.
+constexpr uint64_t CeilDiv(uint64_t numerator, uint64_t denominator) {
+  return numerator / denominator + (numerator % denominator != 0 ? 1 : 0);
+}
+
+}  // namespace
+
+// ----------------------------------------------------------------------
+// AlpSampler implementation
+
+/// \brief Initialize the sampling configuration from AlpConstants.
+template <typename T>
+AlpSampler<T>::AlpSampler()
+    : sample_vector_size_(AlpConstants::kSamplerVectorSize),
+      rowgroup_size_(AlpConstants::kSamplerRowgroupSize),
+      samples_per_vector_(AlpConstants::kSamplerSamplesPerVector),
+      sample_vectors_per_rowgroup_(AlpConstants::kSamplerSampleVectorsPerRowgroup),
+      // The jump is used as a modulus in MustSkipSamplingFromCurrentVector(), so it
+      // must never be zero even if the constants make the quotient round down to 0.
+      rowgroup_sample_jump_(std::max<uint64_t>(
+          1, (rowgroup_size_ / sample_vectors_per_rowgroup_) / sample_vector_size_)) {}
+
+/// \brief Split `input` into chunks of at most sample_vector_size_ values and
+/// offer each chunk to AddSampleVector().
+template <typename T>
+void AlpSampler<T>::AddSample(arrow::util::span<const T> input) {
+  const uint64_t total = input.size();
+  for (uint64_t offset = 0; offset < total; offset += sample_vector_size_) {
+    const uint64_t chunk_size = std::min<uint64_t>(total - offset, sample_vector_size_);
+    AddSampleVector({input.data() + offset, chunk_size});
+  }
+}
+
+/// \brief Record one vector; sample it unless the skip policy rejects it.
+///
+/// The vector and value counters are updated for every call, including calls
+/// whose vector ends up being skipped.
+template <typename T>
+void AlpSampler<T>::AddSampleVector(arrow::util::span<const T> input) {
+  // Decide before bumping the counters: the policy is evaluated against the
+  // number of vectors seen *before* this one.
+  const bool must_skip_current_vector = MustSkipSamplingFromCurrentVector(
+      vectors_count_, vectors_sampled_count_, input.size());
+
+  vectors_count_ += 1;
+  total_values_count_ += input.size();
+  if (must_skip_current_vector) {
+    return;
+  }
+
+  const AlpSamplingParameters sampling_params = GetAlpSamplingParameters(input.size());
+
+  // Slice: take first num_lookup_value elements.
+  const uint64_t lookup_count =
+      std::min<uint64_t>(sampling_params.num_lookup_value, input.size());
+  std::vector<T> current_vector_values(input.begin(), input.begin() + lookup_count);
+
+  // Stride: take every num_sampled_increments-th element.
+  std::vector<T> current_vector_sample;
+  current_vector_sample.reserve(sampling_params.num_sampled_values);
+  for (size_t i = 0; i < current_vector_values.size();
+       i += sampling_params.num_sampled_increments) {
+    current_vector_sample.push_back(current_vector_values[i]);
+  }
+  sample_stored_ += current_vector_sample.size();
+
+  // NOTE(review): complete_vectors_sampled_ is only written in this translation
+  // unit, never read - confirm the full copy is still needed.
+  complete_vectors_sampled_.push_back(std::move(current_vector_values));
+  rowgroup_sample_.push_back(std::move(current_vector_sample));
+  vectors_sampled_count_++;
+}
+
+/// \brief Build the encoding preset from all samples collected so far.
+template <typename T>
+typename AlpSampler<T>::AlpSamplerResult AlpSampler<T>::Finalize() {
+  ARROW_LOG(DEBUG) << "AlpSampler finalized: vectorsSampled=" << vectors_sampled_count_
+                   << "/" << vectors_count_ << " total"
+                   << ", valuesSampled=" << sample_stored_ << "/" << total_values_count_
+                   << " total";
+
+  AlpSamplerResult result;
+  result.alp_preset = AlpCompression<T>::CreateEncodingPreset(rowgroup_sample_);
+
+  ARROW_LOG(DEBUG) << "AlpSampler preset: " << result.alp_preset.combinations.size()
+                   << " exponent/factor combinations"
+                   << ", estimatedSize=" << result.alp_preset.best_compressed_size
+                   << " bytes";
+
+  return result;
+}
+
+/// \brief Compute how many values to look at, the stride between sampled
+/// values, and the resulting number of samples for a vector of the given size.
+template <typename T>
+typename AlpSampler<T>::AlpSamplingParameters AlpSampler<T>::GetAlpSamplingParameters(
+    uint64_t num_current_vector_values) {
+  const uint64_t num_lookup_values = std::min<uint64_t>(
+      num_current_vector_values, static_cast<uint64_t>(AlpConstants::kAlpVectorSize));
+  // Sample equidistant values within a vector; jump a fixed number of values.
+  // Pure integer arithmetic: ceil(num_lookup_values / samples_per_vector_), at least 1.
+  const uint64_t num_sampled_increments =
+      std::max<uint64_t>(1, CeilDiv(num_lookup_values, samples_per_vector_));
+  const uint64_t num_sampled_values = CeilDiv(num_lookup_values, num_sampled_increments);
+
+  ARROW_CHECK(num_sampled_values < AlpConstants::kAlpVectorSize) << "alp_sample_too_large";
+
+  return AlpSamplingParameters{num_lookup_values, num_sampled_increments,
+                               num_sampled_values};
+}
+
+/// \brief Decide whether the current vector is left out of the sample.
+///
+/// \return true if the vector does not fall on the rowgroup sampling jump, or
+/// if it is shorter than the per-vector sample count and at least one vector
+/// has already been sampled.
+template <typename T>
+bool AlpSampler<T>::MustSkipSamplingFromCurrentVector(
+    const uint64_t vectors_count, const uint64_t vectors_sampled_count,
+    const uint64_t current_vector_n_values) {
+  // Sample equidistant vectors; skip a fixed number of vectors.
+  const bool must_select_rowgroup_samples = (vectors_count % rowgroup_sample_jump_) == 0;
+
+  // If we are not in the correct jump, do not take sample from this vector.
+  if (!must_select_rowgroup_samples) {
+    return true;
+  }
+
+  // Do not take samples of non-complete vectors (usually the last one),
+  // except in the case of too little data.
+  return current_vector_n_values < AlpConstants::kSamplerSamplesPerVector &&
+         vectors_sampled_count != 0;
+}
+
+// ----------------------------------------------------------------------
+// Template instantiations
+
+template class AlpSampler<float>;
+template class AlpSampler<double>;
+
+}  // namespace alp
+}  // namespace util
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/alp/AlpSampler.h b/cpp/src/arrow/util/alp/AlpSampler.h
new file mode 100644
index 00000000000..5b9fdb47d44
--- /dev/null
+++ b/cpp/src/arrow/util/alp/AlpSampler.h
@@ -0,0 +1,123 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
+// The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// ALP sampler for collecting samples and creating encoding presets
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+#include "arrow/util/alp/Alp.h"
+#include "arrow/util/span.h"
+
+namespace arrow {
+namespace util {
+namespace alp {
+
+// ----------------------------------------------------------------------
+// AlpSampler
+
+/// \class AlpSampler
+/// \brief Collects samples from data to be compressed with ALP
+///
+/// Usage: Call AddSample() or AddSampleVector() multiple times to collect
+/// samples, then call Finalize() to retrieve the resulting preset.
+///
+/// \tparam T the floating point type (float or double) to sample
+template <typename T>
+class AlpSampler {
+ public:
+  /// \brief Default constructor
+  AlpSampler();
+
+  /// \brief Helper struct containing the preset for ALP compression
+  struct AlpSamplerResult {
+    AlpEncodingPreset alp_preset;
+  };
+
+  /// \brief Add a sample of arbitrary size
+  ///
+  /// The sample is internally separated into vectors on which AddSampleVector()
+  /// is called.
+  ///
+  /// \param[in] input the input data to sample from
+  void AddSample(arrow::util::span<const T> input);
+
+  /// \brief Add a single vector as a sample
+  ///
+  /// \param[in] input the input vector to add.
+  ///            Size should be <= AlpConstants::kAlpVectorSize.
+  void AddSampleVector(arrow::util::span<const T> input);
+
+  /// \brief Finalize sampling and generate the encoding preset
+  ///
+  /// \return an AlpSamplerResult containing the generated encoding preset
+  AlpSamplerResult Finalize();
+
+ private:
+  /// \brief Helper struct to encapsulate settings used for sampling
+  struct AlpSamplingParameters {
+    /// Number of leading values of the vector that are considered
+    uint64_t num_lookup_value;
+    /// Stride between two sampled values
+    uint64_t num_sampled_increments;
+    /// Number of values the stride yields
+    uint64_t num_sampled_values;
+  };
+
+  /// \brief Calculate sampling parameters for the current vector
+  ///
+  /// \param[in] num_current_vector_values number of values in current vector
+  /// \return the sampling parameters to use
+  AlpSamplingParameters GetAlpSamplingParameters(uint64_t num_current_vector_values);
+
+  /// \brief Check if the current vector must be ignored for sampling
+  ///
+  /// \param[in] vectors_count the total number of vectors processed so far
+  /// \param[in] vectors_sampled_count the number of vectors sampled so far
+  /// \param[in] num_current_vector_values number of values in current vector
+  /// \return true if the current vector should be skipped, false otherwise
+  bool MustSkipSamplingFromCurrentVector(uint64_t vectors_count,
+                                         uint64_t vectors_sampled_count,
+                                         uint64_t num_current_vector_values);
+
+  /// Count of vectors that have been sampled
+  uint64_t vectors_sampled_count_ = 0;
+  /// Total count of values processed
+  uint64_t total_values_count_ = 0;
+  /// Total count of vectors processed
+  uint64_t vectors_count_ = 0;
+  /// Number of samples stored
+  uint64_t sample_stored_ = 0;
+  /// Samples collected from current rowgroup
+  std::vector<std::vector<T>> rowgroup_sample_;
+
+  /// Complete vectors sampled
+  std::vector<std::vector<T>> complete_vectors_sampled_;
+
+  // NOTE: the constructor's initializer list computes rowgroup_sample_jump_ from
+  // the members declared before it, so the declaration order below must be kept.
+
+  /// Size of each sample vector
+  const uint64_t sample_vector_size_;
+  /// Size of each rowgroup
+  const uint64_t rowgroup_size_;
+  /// Number of samples to take per vector
+  const uint64_t samples_per_vector_;
+  /// Number of vectors to sample per rowgroup
+  const uint64_t sample_vectors_per_rowgroup_;
+  /// Jump interval for rowgroup sampling
+  const uint64_t rowgroup_sample_jump_;
+};
+
+}  // namespace alp
+}  // namespace util
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/alp/AlpWrapper.cc b/cpp/src/arrow/util/alp/AlpWrapper.cc
new file mode 100644
index 00000000000..363b046a388
--- /dev/null
+++ b/cpp/src/arrow/util/alp/AlpWrapper.cc
@@ -0,0 +1,310 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/alp/AlpWrapper.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <optional>
+
+#include "arrow/util/alp/Alp.h"
+#include "arrow/util/alp/AlpConstants.h"
+#include "arrow/util/alp/AlpSampler.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/ubsan.h"
+
+namespace arrow {
+namespace util {
+namespace alp {
+
+namespace {
+
+// ----------------------------------------------------------------------
+// CompressionBlockHeader
+
+/// \brief Header structure for ALP compression blocks
+///
+/// Contains metadata required to decompress the data.
+///
+/// Serialization format (version 1):
+///
+///   +---------------------------------------------------+
+///   |  CompressionBlockHeader (40 bytes)                |
+///   +---------------------------------------------------+
+///   |  Offset |  Field              |  Size             |
+///   +---------+---------------------+-------------------+
+///   |    0    |  version            |  8 bytes (uint64) |
+///   |    8    |  compressed_size    |  8 bytes (uint64) |
+///   |   16    |  num_elements       |  8 bytes (uint64) |
+///   |   24    |  vector_size        |  8 bytes (uint64) |
+///   |   32    |  compression_mode   |  4 bytes (enum)   |
+///   |   36    |  bit_pack_layout    |  4 bytes (enum)   |
+///   +---------------------------------------------------+
+///
+/// \note version must remain the first field to allow reading the rest
+///       of the header based on version number.
+struct CompressionBlockHeader {
+  /// Version number. Must remain the first field for version-based parsing.
+  uint64_t version = 0;
+  /// Size of the compressed data in bytes (includes header).
+  uint64_t compressed_size = 0;
+  /// Number of elements in the compressed data.
+  uint64_t num_elements = 0;
+  /// Vector size used for compression.
+  /// Must be AlpConstants::kAlpVectorSize for decompression.
+  uint64_t vector_size = 0;
+  /// Compression mode (currently only kAlp is supported).
+  AlpMode compression_mode = AlpMode::kAlp;
+  /// Bit packing layout used for bitpacking.
+  AlpBitPackLayout bit_pack_layout = AlpBitPackLayout::kNormal;
+
+  /// \brief Get the size in bytes of the CompressionBlockHeader for a version
+  ///
+  /// Aborts via ARROW_CHECK for any version other than 1.
+  ///
+  /// \param[in] v the version number
+  /// \return the size in bytes
+  static size_t GetSizeForVersion(uint64_t v) {
+    // Check first so that no path can return an uninitialized size.
+    ARROW_CHECK(v == 1) << "unknown_version: " << v;
+    return sizeof(version) + sizeof(compressed_size) + sizeof(num_elements) +
+           sizeof(vector_size) + sizeof(compression_mode) + sizeof(bit_pack_layout);
+  }
+
+  /// \brief Check whether the given version is valid
+  ///
+  /// \param[in] v the version to check
+  /// \return the version if valid, otherwise asserts
+  static uint64_t IsValidVersion(uint64_t v) {
+    ARROW_CHECK(v == 1) << "invalid_version: " << v;
+    return v;
+  }
+};
+
+}  // namespace
+
+// ----------------------------------------------------------------------
+// AlpWrapper::CompressionBlockHeader definition
+
+template <typename T>
+struct AlpWrapper<T>::CompressionBlockHeader
+    : public ::arrow::util::alp::CompressionBlockHeader {};
+
+// ----------------------------------------------------------------------
+// AlpWrapper implementation
+
+/// \brief Read and validate the block header at the start of `comp`.
+///
+/// Aborts via ARROW_CHECK if the buffer is too small or the version is unknown.
+template <typename T>
+typename AlpWrapper<T>::CompressionBlockHeader AlpWrapper<T>::LoadHeader(
+    const char* comp, size_t comp_size) {
+  CompressionBlockHeader header{};
+  ARROW_CHECK(comp_size >= sizeof(header.version))
+      << "alp_loadHeader_compSize_too_small_for_header_version";
+  // The version comes first so the header size can be derived from it.
+  uint64_t version;
+  std::memcpy(&version, comp, sizeof(header.version));
+  ::arrow::util::alp::CompressionBlockHeader::IsValidVersion(version);
+  const size_t header_size =
+      ::arrow::util::alp::CompressionBlockHeader::GetSizeForVersion(version);
+  ARROW_CHECK(comp_size >= header_size) << "alp_loadHeader_compSize_too_small";
+  std::memcpy(&header, comp, header_size);
+  return header;
+}
+
+/// \brief Sample, compress and frame `decomp` into `comp`.
+///
+/// On success `*comp_size` receives the number of bytes written (header
+/// included). If the output buffer cannot hold the header or the encoded
+/// vectors, `*comp_size` is set to 0.
+template <typename T>
+void AlpWrapper<T>::Encode(const T* decomp, size_t decomp_size, char* comp,
+                           size_t* comp_size, std::optional<AlpMode> enforce_mode) {
+  // Reserved for future use; only AlpMode::kAlp is produced.
+  static_cast<void>(enforce_mode);
+  ARROW_CHECK(decomp_size % sizeof(T) == 0) << "alp_encode_input_must_be_multiple_of_T";
+  const uint64_t element_count = decomp_size / sizeof(T);
+  const uint64_t version = ::arrow::util::alp::CompressionBlockHeader::IsValidVersion(
+      AlpConstants::kAlpVersion);
+  const uint64_t header_size =
+      ::arrow::util::alp::CompressionBlockHeader::GetSizeForVersion(version);
+
+  // Without this guard the subtraction below would wrap around and the encoder
+  // would believe it has a huge output buffer.
+  if (*comp_size < header_size) {
+    *comp_size = 0;
+    return;
+  }
+
+  AlpSampler<T> sampler;
+  sampler.AddSample({decomp, element_count});
+  auto sampling_result = sampler.Finalize();
+
+  // Make room to store header afterwards.
+  char* encoded_header = comp;
+  comp += header_size;
+  const uint64_t remaining_compressed_size = *comp_size - header_size;
+
+  const CompressionProgress compression_progress =
+      EncodeAlp(decomp, element_count, comp, remaining_compressed_size,
+                sampling_result.alp_preset);
+
+  // EncodeAlp() reports {0, 0} when a vector did not fit; an empty input also
+  // yields {0, 0} but then element_count is 0 as well and this is a success.
+  if (compression_progress.num_uncompressed_elements_taken != element_count) {
+    *comp_size = 0;
+    return;
+  }
+
+  CompressionBlockHeader header{};
+  header.version = version;
+  header.compressed_size =
+      header_size + compression_progress.num_compressed_bytes_produced;
+  header.num_elements = element_count;
+  header.vector_size = AlpConstants::kAlpVectorSize;
+  header.compression_mode = AlpMode::kAlp;
+  header.bit_pack_layout = AlpBitPackLayout::kNormal;
+
+  std::memcpy(encoded_header, &header, header_size);
+  *comp_size = header.compressed_size;
+}
+
+/// \brief Decode a block produced by Encode() into `decomp`.
+///
+/// `*decomp_size` is set to the number of bytes written, or to 0 if the output
+/// buffer cannot hold all elements announced by the header.
+template <typename T>
+template <typename TargetType>
+void AlpWrapper<T>::Decode(TargetType* decomp, size_t* decomp_size, const char* comp,
+                           size_t comp_size) {
+  const CompressionBlockHeader header = LoadHeader(comp, comp_size);
+  ARROW_CHECK(header.vector_size == AlpConstants::kAlpVectorSize)
+      << "unsupported_vector_size: " << header.vector_size;
+
+  // num_elements comes from the input; compare by division so that a huge
+  // value cannot overflow the multiplication and slip past the check.
+  if (header.num_elements > *decomp_size / sizeof(TargetType)) {
+    *decomp_size = 0;
+    return;
+  }
+
+  const uint64_t elements_to_decode = header.num_elements;
+  const size_t header_size =
+      ::arrow::util::alp::CompressionBlockHeader::GetSizeForVersion(header.version);
+  const char* compression_body = comp + header_size;
+  const uint64_t compression_body_size = comp_size - header_size;
+
+  ARROW_CHECK(header.compression_mode == AlpMode::kAlp) << "alp_decode_unsupported_mode";
+
+  const uint64_t elements_decoded =
+      DecodeAlp<TargetType>(decomp, elements_to_decode, compression_body,
+                            compression_body_size, header.bit_pack_layout)
+          .num_decompressed_elements_produced;
+  *decomp_size = elements_decoded * sizeof(TargetType);
+}
+
+template void AlpWrapper<float>::Decode<float>(float* decomp, size_t* decomp_size,
+                                               const char* comp, size_t comp_size);
+template void AlpWrapper<float>::Decode<double>(double* decomp, size_t* decomp_size,
+                                                const char* comp, size_t comp_size);
+template void AlpWrapper<double>::Decode<double>(double* decomp, size_t* decomp_size,
+                                                 const char* comp, size_t comp_size);
+
+/// \brief Size in bytes needed to hold the decoded block as TargetType values.
+template <typename T>
+template <typename TargetType>
+uint64_t AlpWrapper<T>::GetDecompressedSize(const char* comp, uint64_t comp_size) {
+  const CompressionBlockHeader header = LoadHeader(comp, comp_size);
+  return header.num_elements * sizeof(TargetType);
+}
+
+template uint64_t AlpWrapper<float>::GetDecompressedSize<float>(const char* comp,
+                                                                uint64_t comp_size);
+template uint64_t AlpWrapper<float>::GetDecompressedSize<double>(const char* comp,
+                                                                 uint64_t comp_size);
+template uint64_t AlpWrapper<double>::GetDecompressedSize<double>(const char* comp,
+                                                                  uint64_t comp_size);
+
+/// \brief Upper bound for the encoded size of `decomp_size` bytes of input.
+template <typename T>
+uint64_t AlpWrapper<T>::GetMaxCompressedSize(uint64_t decomp_size) {
+  ARROW_CHECK(decomp_size % sizeof(T) == 0)
+      << "alp_decompressed_size_not_multiple_of_T";
+  const uint64_t element_count = decomp_size / sizeof(T);
+  const uint64_t version = ::arrow::util::alp::CompressionBlockHeader::IsValidVersion(
+      AlpConstants::kAlpVersion);
+  uint64_t max_alp_size =
+      ::arrow::util::alp::CompressionBlockHeader::GetSizeForVersion(version);
+  // Add header sizes: one AlpEncodedVectorInfo per (possibly partial) vector.
+  // Integer ceiling division keeps the whole computation in uint64_t.
+  const uint64_t vector_size = static_cast<uint64_t>(AlpConstants::kAlpVectorSize);
+  const uint64_t vector_count =
+      element_count / vector_size + (element_count % vector_size != 0 ? 1 : 0);
+  max_alp_size += sizeof(AlpEncodedVectorInfo) * vector_count;
+  // Worst case: everything is an exception, except two values that are chosen
+  // with large difference to make FOR encoding for placeholders impossible.
+  // Values/placeholders.
+  max_alp_size += element_count * sizeof(T);
+  // Exceptions.
+  max_alp_size += element_count * sizeof(T);
+  // Exception positions.
+  max_alp_size += element_count * sizeof(AlpConstants::PositionType);
+
+  return max_alp_size;
+}
+
+/// \brief Encode `element_count` values vector by vector into `comp`.
+///
+/// \return bytes produced and elements consumed, or {0, 0} as soon as one
+/// encoded vector is empty or does not fit into the remaining output space.
+template <typename T>
+auto AlpWrapper<T>::EncodeAlp(const T* decomp, uint64_t element_count, char* comp,
+                              size_t comp_size, const AlpEncodingPreset& combinations)
+    -> CompressionProgress {
+  uint64_t output_offset = 0;
+  uint64_t input_offset = 0;
+  uint64_t remaining_output_size = comp_size;
+
+  while (input_offset < element_count) {
+    const uint64_t elements_to_encode = std::min<uint64_t>(
+        AlpConstants::kAlpVectorSize, element_count - input_offset);
+    const AlpEncodedVector<T> encoded_vector = AlpCompression<T>::CompressVector(
+        decomp + input_offset, elements_to_encode, combinations);
+
+    const uint64_t compressed_vector_size = encoded_vector.GetStoredSize();
+    if (compressed_vector_size == 0 || compressed_vector_size > remaining_output_size) {
+      return CompressionProgress{0, 0};
+    }
+
+    encoded_vector.Store({comp + output_offset, remaining_output_size});
+
+    remaining_output_size -= compressed_vector_size;
+    output_offset += compressed_vector_size;
+    input_offset += elements_to_encode;
+  }
+  return CompressionProgress{output_offset, input_offset};
+}
+
+/// \brief Decode encoded vectors from `comp` until the input is exhausted or
+/// `decomp_element_count` elements have been produced.
+template <typename T>
+template <typename TargetType>
+auto AlpWrapper<T>::DecodeAlp(TargetType* decomp, size_t decomp_element_count,
+                              const char* comp, size_t comp_size,
+                              AlpBitPackLayout bit_pack_layout) -> DecompressionProgress {
+  uint64_t input_offset = 0;
+  uint64_t output_offset = 0;
+  while (input_offset < comp_size && output_offset < decomp_element_count) {
+    // Use zero-copy view to avoid memory allocation and copying
+    const AlpEncodedVectorView<T> encoded_view = AlpEncodedVectorView<T>::LoadView(
+        {comp + input_offset, comp_size - input_offset});
+    const uint64_t compressed_size = encoded_view.GetStoredSize();
+    const uint64_t element_count = encoded_view.vector_info.num_elements;
+
+    ARROW_CHECK(output_offset + element_count <= decomp_element_count)
+        << "alp_decode_output_too_small: " << output_offset << " vs " << element_count
+        << " vs " << decomp_element_count;
+
+    AlpCompression<T>::DecompressVectorView(encoded_view, bit_pack_layout,
+                                            decomp + output_offset);
+
+    input_offset += compressed_size;
+    output_offset += element_count;
+  }
+
+  return DecompressionProgress{output_offset, input_offset};
+}
+
+// ----------------------------------------------------------------------
+// Template instantiations
+
+template class AlpWrapper<float>;
+template class AlpWrapper<double>;
+
+}  // namespace alp
+}  // namespace util
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/alp/AlpWrapper.h b/cpp/src/arrow/util/alp/AlpWrapper.h
new file mode 100644
index 00000000000..19aa2a3a4bb
--- /dev/null
+++ b/cpp/src/arrow/util/alp/AlpWrapper.h
@@ -0,0 +1,156 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// High-level wrapper interface for ALP compression
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <optional>
+
+#include "arrow/util/alp/Alp.h"
+
+namespace arrow {
+namespace util {
+namespace alp {
+
+// ----------------------------------------------------------------------
+// AlpWrapper
+
+/// \class AlpWrapper
+/// \brief High-level interface for ALP compression
+///
+/// AlpWrapper is an interface for Adaptive Lossless floating-Point Compression
+/// (ALP) (https://dl.acm.org/doi/10.1145/3626717). For encoding, it samples
+/// the data and applies decimal compression (Alp) to floating point values.
+/// This class acts as a wrapper around the vector-based interfaces of
+/// AlpSampler and Alp.
+///
+/// \tparam T the floating point type (float or double)
+template <typename T>
+class AlpWrapper {
+ public:
+  /// \brief Encode floating point values using ALP decimal compression
+  ///
+  /// \param[in] decomp pointer to the input that is to be encoded
+  /// \param[in] decomp_size size of decomp in bytes.
+  ///            This needs to be a multiple of sizeof(T).
+  /// \param[out] comp pointer to the memory region we will encode into.
+  ///             The caller is responsible for ensuring this is big enough.
+  /// \param[in,out] comp_size the actual size of the encoded data in bytes,
+  ///                expects the size of comp as input. If this is too small,
+  ///                this is set to 0 and we bail out.
+  /// \param[in] enforce_mode reserved for future use.
+  ///            Currently only AlpMode::kAlp is supported.
+  static void Encode(const T* decomp, size_t decomp_size, char* comp,
+                     size_t* comp_size,
+                     std::optional<AlpMode> enforce_mode = std::nullopt);
+
+  /// \brief Decode floating point values
+  ///
+  /// \param[out] decomp pointer to the memory region we will decode into.
+  ///             The caller is responsible for ensuring this is big enough.
+  /// \param[in,out] decomp_size the actual size of decoded data in bytes,
+  ///                expects the decomp size as input. If the output cannot
+  ///                hold all encoded elements, this is set to 0 and nothing
+  ///                is decoded.
+  /// \param[in] comp pointer to the input that is to be decoded
+  /// \param[in] comp_size size of the input in bytes.
+  /// \tparam TargetType the type that is used to store the output.
+  ///         May not be a narrowing conversion from T.
+  template <typename TargetType>
+  static void Decode(TargetType* decomp, size_t* decomp_size, const char* comp,
+                     size_t comp_size);
+
+  /// \brief Get the decompressed size of a compression block
+  ///
+  /// Get the size of a compression block encoded previously with
+  /// AlpWrapper::Encode().
+  ///
+  /// \param[in] comp start of the memory region containing the compression block
+  /// \param[in] comp_size size of the compression block
+  /// \return the decompressed size of the block, in bytes
+  /// \tparam TargetType the type that is used to store the output.
+  ///         May not be a narrowing conversion from T.
+  template <typename TargetType>
+  static uint64_t GetDecompressedSize(const char* comp, uint64_t comp_size);
+
+  /// \brief Get the maximum compressed size of an uncompressed buffer
+  ///
+  /// \param[in] decomp_size the size of the uncompressed buffer in bytes.
+  ///            This needs to be a multiple of sizeof(T).
+  /// \return the maximum size of the compressed buffer
+  static uint64_t GetMaxCompressedSize(uint64_t decomp_size);
+
+ private:
+  struct CompressionBlockHeader;
+
+  /// \brief Tracks the progress of a compression operation
+  ///
+  /// Used to report how much data was consumed and produced during encoding.
+  struct CompressionProgress {
+    /// Number of compressed bytes written to output
+    uint64_t num_compressed_bytes_produced = 0;
+    /// Number of input elements consumed
+    uint64_t num_uncompressed_elements_taken = 0;
+  };
+
+  /// \brief Tracks the progress of a decompression operation
+  ///
+  /// Used to report how much data was consumed and produced during decoding.
+  struct DecompressionProgress {
+    /// Number of decompressed elements written
+    uint64_t num_decompressed_elements_produced = 0;
+    /// Number of compressed bytes consumed
+    uint64_t num_compressed_bytes_taken = 0;
+  };
+
+  /// \brief Compress a buffer using the ALP variant
+  ///
+  /// \param[in] decomp array of floating point numbers to compress
+  /// \param[in] element_count the number of floating point numbers
+  /// \param[out] comp the buffer to be compressed into
+  /// \param[in] comp_size the size of the compression buffer
+  /// \param[in] combinations the encoding preset to use
+  /// \return the compression progress; both counters are 0 if an encoded
+  ///         vector did not fit into the compression buffer
+  static CompressionProgress EncodeAlp(const T* decomp, uint64_t element_count,
+                                       char* comp, size_t comp_size,
+                                       const AlpEncodingPreset& combinations);
+
+  /// \brief Decompress a buffer using the ALP variant
+  ///
+  /// \param[out] decomp the buffer to be decompressed into
+  /// \param[in] decomp_element_count the number of floats to decompress
+  /// \param[in] comp the compressed buffer to be decompressed
+  /// \param[in] comp_size the size of the compressed data
+  /// \param[in] bit_pack_layout the bit packing layout used
+  /// \return the decompression progress
+  /// \tparam TargetType the type that is used to store the output.
+  ///         May not be a narrowing conversion from T.
+  template <typename TargetType>
+  static DecompressionProgress DecodeAlp(TargetType* decomp, size_t decomp_element_count,
+                                         const char* comp, size_t comp_size,
+                                         AlpBitPackLayout bit_pack_layout);
+
+  /// \brief Load the CompressionBlockHeader from compressed data
+  ///
+  /// \param[in] comp the compressed buffer
+  /// \param[in] comp_size the size of the compressed data
+  /// \return the CompressionBlockHeader from comp
+  static CompressionBlockHeader LoadHeader(const char* comp, size_t comp_size);
+};
+
+}  // namespace alp
+}  // namespace util
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/alp/alp_test.cc b/cpp/src/arrow/util/alp/alp_test.cc
new file mode 100644
index 00000000000..f3a1bfd2d4a
--- /dev/null
+++ b/cpp/src/arrow/util/alp/alp_test.cc
@@ -0,0 +1,213 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstddef>
+#include <cstdint>
+#include <random>
+#include <vector>
+
+#include <gtest/gtest.h>
+
+#include "arrow/testing/gtest_util.h"
+#include "arrow/util/alp/Alp.h"
+#include "arrow/util/alp/AlpConstants.h"
+#include "arrow/util/alp/AlpSampler.h"
+// NOTE(review): no file named CompressFloatAlp.h is added by the visible parts of
+// this patch (the wrapper header is AlpWrapper.h) - confirm this include resolves.
+#include "arrow/util/alp/CompressFloatAlp.h"
+#include "arrow/util/bit_stream_utils_internal.h"
+#include "arrow/util/bpacking_internal.h"
+
+namespace arrow {
+namespace util {
+namespace alp {
+
+// ============================================================================
+// ALP Constants Tests
+// ============================================================================
+
+TEST(AlpConstantsTest, SamplerConstants) {
+  EXPECT_GT(AlpConstants::kSamplerVectorSize, 0);
+  EXPECT_GT(AlpConstants::kSamplerRowgroupSize, 0);
+  EXPECT_GT(AlpConstants::kSamplerSamplesPerVector, 0);
+  EXPECT_EQ(AlpConstants::kAlpVersion, 1);
+}
+
+// ============================================================================
+// ALP Compression Tests (Float)
+// ============================================================================
+
+class AlpCompressionFloatTest : public ::testing::Test {
+ protected:
+  /// Round-trip `input` through CompressVector()/DecompressVector() and require
+  /// every value to come back unchanged.
+  void TestCompressDecompressFloat(const std::vector<float>& input) {
+    AlpCompression<float> compressor;
+
+    // Compress
+    AlpEncodingPreset preset{};  // Default preset
+    auto encoded = compressor.CompressVector(input.data(), input.size(), preset);
+
+    // Decompress
+    std::vector<float> output(input.size());
+    compressor.DecompressVector(encoded, AlpBitPackLayout::kNormal, output.data());
+
+    // Verify. ALP is lossless, so compare exactly: a ULP-tolerant comparison
+    // would let a lossy decode pass unnoticed.
+    ASSERT_EQ(output.size(), input.size());
+    for (size_t i = 0; i < input.size(); ++i) {
+      EXPECT_EQ(output[i], input[i]) << "Mismatch at index " << i;
+    }
+  }
+};
+
+TEST_F(AlpCompressionFloatTest, SimpleSequence) {
+  std::vector<float> input(64);
+  for (size_t i = 0; i < input.size(); ++i) {
+    input[i] = static_cast<float>(i + 1);
+  }
+  TestCompressDecompressFloat(input);
+}
+
+TEST_F(AlpCompressionFloatTest, DecimalValues) {
+  std::vector<float> input(64);
+  for (size_t i = 0; i < input.size(); ++i) {
+    input[i] = static_cast<float>(i) + 0.5f;
+  }
+  TestCompressDecompressFloat(input);
+}
+
+TEST_F(AlpCompressionFloatTest, SmallValues) {
+  std::vector<float> input(64);
+  for (size_t i = 0; i < input.size(); ++i) {
+    input[i] = 0.001f * (i + 1);
+  }
+  TestCompressDecompressFloat(input);
+}
+
+TEST_F(AlpCompressionFloatTest, MixedValues) {
+  std::vector<float> input = {100.5f,     200.25f,     300.125f,     400.0625f,
+                              500.03125f, 600.015625f, 700.0078125f,
+                              800.00390625f};
+  TestCompressDecompressFloat(input);
+}
+
+TEST_F(AlpCompressionFloatTest, RandomValues) {
+  std::mt19937 rng(42);
+  std::uniform_real_distribution<float> dist(0.0f, 1000.0f);
+
+  std::vector<float> input(64);
+  for (auto& v : input) {
+    v = dist(rng);
+  }
+
+  TestCompressDecompressFloat(input);
+}
+
+// ============================================================================
+// ALP Compression Tests (Double)
+// ============================================================================
+
+class AlpCompressionDoubleTest : public ::testing::Test {
+ protected:
+  /// Round-trip `input` through CompressVector()/DecompressVector() and require
+  /// every value to come back unchanged.
+  void TestCompressDecompressDouble(const std::vector<double>& input) {
+    AlpCompression<double> compressor;
+
+    // Compress
+    AlpEncodingPreset preset{};  // Default preset
+    auto encoded = compressor.CompressVector(input.data(), input.size(), preset);
+
+    // Decompress
+    std::vector<double> output(input.size());
+    compressor.DecompressVector(encoded, AlpBitPackLayout::kNormal, output.data());
+
+    // Verify (exact comparison: the codec is lossless).
+    ASSERT_EQ(output.size(), input.size());
+    for (size_t i = 0; i < input.size(); ++i) {
+      EXPECT_EQ(output[i], input[i]) << "Mismatch at index " << i;
+    }
+  }
+};
+
+TEST_F(AlpCompressionDoubleTest, SimpleSequence) {
+  std::vector<double> input(64);
+  for (size_t i = 0; i < input.size(); ++i) {
+    input[i] = static_cast<double>(i + 1);
+  }
+  TestCompressDecompressDouble(input);
+}
+
+TEST_F(AlpCompressionDoubleTest, HighPrecision) {
+  std::vector<double> input(64);
+  for (size_t i = 0; i < input.size(); ++i) {
+    input[i] = 1.123456789 * (i + 1);
+  }
+  TestCompressDecompressDouble(input);
+}
+
+TEST_F(AlpCompressionDoubleTest, VerySmallValues) {
+  std::vector<double> input(64);
+  for (size_t i = 0; i < input.size(); ++i) {
+    input[i] = 1e-10 * (i + 1);
+  }
+  TestCompressDecompressDouble(input);
+}
+
+// ============================================================================
+// Integration Tests
+// ============================================================================
+
+TEST(AlpIntegrationTest, LargeFloatDataset) {
+  std::mt19937 rng(12345);
+  std::uniform_real_distribution<float> dist(-1000.0f, 1000.0f);
+
+  std::vector<float> input(1024);
+  for (auto& v : input) {
+    v = dist(rng);
+  }
+
+  AlpCompression<float> compressor;
+  AlpEncodingPreset preset{};
+  auto encoded = compressor.CompressVector(input.data(), input.size(), preset);
+
+  std::vector<float> output(input.size());
+  compressor.DecompressVector(encoded, AlpBitPackLayout::kNormal, output.data());
+
+  for (size_t i = 0; i < input.size(); ++i) {
+    EXPECT_EQ(output[i], input[i]) << "Mismatch at index " << i;
+  }
+}
+
+TEST(AlpIntegrationTest, LargeDoubleDataset) {
+  std::mt19937 rng(12345);
+  std::uniform_real_distribution<double> dist(-1000.0, 1000.0);
+
+  std::vector<double> input(1024);
+  for (auto& v : input) {
+    v = dist(rng);
+  }
+
+  AlpCompression<double> compressor;
+  AlpEncodingPreset preset{};
+  auto encoded = compressor.CompressVector(input.data(), input.size(), preset);
+
+  std::vector<double> output(input.size());
+  compressor.DecompressVector(encoded, AlpBitPackLayout::kNormal, output.data());
+
+  for (size_t i = 0; i < input.size(); ++i) {
+    EXPECT_EQ(output[i], input[i]) << "Mismatch at index " << i;
+  }
+}
+
+}  // namespace alp
+}  // namespace util
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/small_vector.h b/cpp/src/arrow/util/small_vector.h
index f371e647152..90dcb111a41 100644
--- a/cpp/src/arrow/util/small_vector.h
+++ b/cpp/src/arrow/util/small_vector.h
@@ -457,6 +457,22 @@ class StaticVectorImpl {
     }
   }
 
+  // Unsafe resize without initialization - use only when you will immediately
+  // overwrite the memory (e.g., before memcpy). Only safe for POD types.
+  void UnsafeResize(size_t n) {
+    // Growing leaves the new slots unconstructed, which is only well-defined for
+    // trivial element types; reject anything else at compile time.
+    static_assert(std::is_trivial<T>::value,
+                  "UnsafeResize() requires a trivial element type");
+    const size_t old_size = storage_.size_;
+    if (n > old_size) {
+      storage_.bump_size(n - old_size);
+      // No construction - caller must initialize!
+    } else {
+      auto* p = storage_.storage_ptr();
+      for (size_t i = n; i < old_size; ++i) {
+        p[i].destroy();
+      }
+      storage_.reduce_size(old_size - n);
+    }
+  }
+
  private:
   template <typename InputIt>
   void init_by_copying(size_t n, InputIt src) {
diff --git a/cpp/src/arrow/util/type_fwd.h b/cpp/src/arrow/util/type_fwd.h
index b8934ecbd4c..5ba696104bb 100644
--- a/cpp/src/arrow/util/type_fwd.h
+++ b/cpp/src/arrow/util/type_fwd.h
@@ -55,7 +55,8 @@ struct Compression {
     LZ4_FRAME,
     LZO,
     BZ2,
-    LZ4_HADOOP
+    LZ4_HADOOP,
+    ALP
   };
 };
 
diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index dc7d40d2a38..92a75bcbd2e 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -442,6 +442,7 @@ add_parquet_benchmark(bloom_filter_benchmark SOURCES bloom_filter_benchmark.cc
 add_parquet_benchmark(column_reader_benchmark)
 add_parquet_benchmark(column_io_benchmark)
 add_parquet_benchmark(encoding_benchmark)
+add_parquet_benchmark(encoding_alp_benchmark)
 add_parquet_benchmark(level_conversion_benchmark)
 add_parquet_benchmark(metadata_benchmark)
 add_parquet_benchmark(page_index_benchmark SOURCES page_index_benchmark.cc
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 79b837f755c..9dbdabe9b2f 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -859,6 +859,7 @@ class ColumnReaderImplBase {
     switch (encoding) {
       case Encoding::PLAIN:
       case Encoding::BYTE_STREAM_SPLIT:
+      case Encoding::ALP:
       case Encoding::RLE:
       case Encoding::DELTA_BINARY_PACKED:
       case Encoding::DELTA_BYTE_ARRAY:
diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc
index 3ce2323d29a..885477e513c 100644
--- a/cpp/src/parquet/decoder.cc
+++ b/cpp/src/parquet/decoder.cc
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
#include #include @@ -40,6 +41,9 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/byte_stream_split_internal.h" +#include "arrow/util/alp/Alp.h" +#include "arrow/util/alp/AlpConstants.h" +#include "arrow/util/alp/AlpWrapper.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging_internal.h" @@ -2323,6 +2327,124 @@ class ByteStreamSplitDecoder : public ByteStreamSplitDecoderBase +class AlpDecoder : public TypedDecoderImpl { + public: + using Base = TypedDecoderImpl; + using T = typename DType::c_type; + + explicit AlpDecoder(const ColumnDescriptor* descr) + : Base(descr, Encoding::ALP), current_offset_{0}, needs_decode_{false} { + static_assert(std::is_same::value || std::is_same::value, + "ALP only supports float and double types"); + } + + void SetData(int num_values, const uint8_t* data, int len) final { + Base::SetData(num_values, data, len); + current_offset_ = 0; + needs_decode_ = (len > 0 && num_values > 0); + decoded_buffer_.clear(); + } + + int Decode(T* buffer, int max_values) override { + // Fast path: decode directly into output buffer if requesting all values + if (needs_decode_ && max_values >= this->num_values_) { + size_t decompSize = this->num_values_ * sizeof(T); + ::arrow::util::alp::AlpWrapper::Decode( + buffer, &decompSize, + reinterpret_cast(this->data_), this->len_); + + const int decoded = this->num_values_; + this->num_values_ = 0; + needs_decode_ = false; + return decoded; + } + + // Slow path: partial read - decode to intermediate buffer + // ALP Bit unpacker needs batches of 64 + if (needs_decode_) { + decoded_buffer_.resize(this->num_values_); + size_t decompSize = this->num_values_ * sizeof(T); + ::arrow::util::alp::AlpWrapper::Decode( + decoded_buffer_.data(), &decompSize, + reinterpret_cast(this->data_), this->len_); + needs_decode_ = false; + } + + // Copy from intermediate buffer + const int values_to_decode = std::min( + max_values, + 
static_cast(decoded_buffer_.size() - current_offset_)); + + if (values_to_decode > 0) { + std::memcpy(buffer, decoded_buffer_.data() + current_offset_, + values_to_decode * sizeof(T)); + current_offset_ += values_to_decode; + this->num_values_ -= values_to_decode; + } + + return values_to_decode; + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename EncodingTraits::Accumulator* builder) override { + const int values_to_decode = num_values - null_count; + if (ARROW_PREDICT_FALSE(this->num_values_ < values_to_decode)) { + ParquetException::EofException("ALP DecodeArrow: Not enough values available. " + "Available: " + std::to_string(this->num_values_) + + ", Requested: " + std::to_string(values_to_decode)); + } + + // Decode if needed (DecodeArrow always needs intermediate buffer for nulls) + if (needs_decode_) { + decoded_buffer_.resize(this->num_values_); + size_t decompSize = this->num_values_ * sizeof(T); + ::arrow::util::alp::AlpWrapper::Decode( + decoded_buffer_.data(), &decompSize, + reinterpret_cast(this->data_), this->len_); + needs_decode_ = false; + } + + if (null_count == 0) { + // Fast path: no nulls + PARQUET_THROW_NOT_OK(builder->AppendValues( + decoded_buffer_.data() + current_offset_, values_to_decode)); + current_offset_ += values_to_decode; + this->num_values_ -= values_to_decode; + return values_to_decode; + } else { + // Slow path: with nulls + int value_idx = 0; + for (int i = 0; i < num_values; ++i) { + if (::arrow::bit_util::GetBit(valid_bits, valid_bits_offset + i)) { + PARQUET_THROW_NOT_OK(builder->Append(decoded_buffer_[current_offset_ + value_idx])); + ++value_idx; + } else { + PARQUET_THROW_NOT_OK(builder->AppendNull()); + } + } + current_offset_ += values_to_decode; + this->num_values_ -= values_to_decode; + return values_to_decode; + } + } + + int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits, + int64_t valid_bits_offset, + typename 
EncodingTraits::DictAccumulator* builder) override { + ParquetException::NYI("DecodeArrow to DictAccumulator for ALP"); + } + + private: + std::vector decoded_buffer_; + size_t current_offset_; + bool needs_decode_; +}; + } // namespace // ---------------------------------------------------------------------- @@ -2369,6 +2491,15 @@ std::unique_ptr MakeDecoder(Type::type type_num, Encoding::type encodin "BYTE_STREAM_SPLIT only supports FLOAT, DOUBLE, INT32, INT64 " "and FIXED_LEN_BYTE_ARRAY"); } + } else if (encoding == Encoding::ALP) { + switch (type_num) { + case Type::FLOAT: + return std::make_unique>(descr); + case Type::DOUBLE: + return std::make_unique>(descr); + default: + throw ParquetException("ALP encoding only supports FLOAT and DOUBLE"); + } } else if (encoding == Encoding::DELTA_BINARY_PACKED) { switch (type_num) { case Type::INT32: diff --git a/cpp/src/parquet/encoder.cc b/cpp/src/parquet/encoder.cc index 04f079ce70c..ad9d1cd64eb 100644 --- a/cpp/src/parquet/encoder.cc +++ b/cpp/src/parquet/encoder.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,9 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/byte_stream_split_internal.h" +#include "arrow/util/alp/Alp.h" +#include "arrow/util/alp/AlpConstants.h" +#include "arrow/util/alp/AlpWrapper.h" #include "arrow/util/checked_cast.h" #include "arrow/util/hashing.h" #include "arrow/util/int_util_overflow.h" @@ -995,6 +999,90 @@ class ByteStreamSplitEncoder : public ByteStreamSplitEncoderBase +class AlpEncoder : public EncoderImpl, virtual public TypedEncoder { + public: + using T = typename DType::c_type; + using ArrowType = typename EncodingTraits::ArrowType; + using TypedEncoder::Put; + + explicit AlpEncoder(const ColumnDescriptor* descr, + ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) + : EncoderImpl(descr, Encoding::ALP, pool), + sink_{pool} { + static_assert(std::is_same::value || std::is_same::value, + 
"ALP only supports float and double types"); + } + + int64_t EstimatedDataEncodedSize() override { return sink_.length(); } + + std::shared_ptr FlushValues() override { + if (sink_.length() == 0) { + // Empty buffer case + PARQUET_ASSIGN_OR_THROW(auto buf, sink_.Finish()); + return buf; + } + + // Call AlpWrapper::Encode() - it handles sampling, preset selection, and compression + const size_t decompSize = sink_.length(); + size_t compSize = ::arrow::util::alp::AlpWrapper::GetMaxCompressedSize(decompSize); + + PARQUET_ASSIGN_OR_THROW( + auto compressed_buffer, + ::arrow::AllocateResizableBuffer(compSize, this->memory_pool())); + + ::arrow::util::alp::AlpWrapper::Encode( + reinterpret_cast(sink_.data()), + decompSize, + reinterpret_cast(compressed_buffer->mutable_data()), + &compSize); + + PARQUET_THROW_NOT_OK(compressed_buffer->Resize(compSize)); + sink_.Reset(); + + return std::shared_ptr(std::move(compressed_buffer)); + } + + void Put(const T* buffer, int num_values) override { + if (num_values > 0) { + PARQUET_THROW_NOT_OK( + sink_.Append(reinterpret_cast(buffer), + num_values * static_cast(sizeof(T)))); + } + } + + void PutSpaced(const T* src, int num_values, const uint8_t* valid_bits, + int64_t valid_bits_offset) override { + if (valid_bits != NULLPTR) { + PARQUET_ASSIGN_OR_THROW(auto buffer, ::arrow::AllocateBuffer(num_values * sizeof(T), + this->memory_pool())); + T* data = buffer->template mutable_data_as(); + const int num_valid_values = ::arrow::util::internal::SpacedCompress( + src, num_values, valid_bits, valid_bits_offset, data); + Put(data, num_valid_values); + } else { + Put(src, num_values); + } + } + + void Put(const ::arrow::Array& values) override { + if (values.type_id() != ArrowType::type_id) { + throw ParquetException(std::string() + "direct put from " + + values.type()->ToString() + " not supported"); + } + const auto& data = *values.data(); + this->PutSpaced(data.GetValues(1), + static_cast(data.length), data.GetValues(0, 0), + data.offset); 
+ } + + private: + ::arrow::BufferBuilder sink_; +}; + // ---------------------------------------------------------------------- // DELTA_BINARY_PACKED encoder @@ -1816,6 +1904,15 @@ std::unique_ptr MakeEncoder(Type::type type_num, Encoding::type encodin "BYTE_STREAM_SPLIT only supports FLOAT, DOUBLE, INT32, INT64 " "and FIXED_LEN_BYTE_ARRAY"); } + } else if (encoding == Encoding::ALP) { + switch (type_num) { + case Type::FLOAT: + return std::make_unique>(descr, pool); + case Type::DOUBLE: + return std::make_unique>(descr, pool); + default: + throw ParquetException("ALP encoding only supports FLOAT and DOUBLE"); + } } else if (encoding == Encoding::DELTA_BINARY_PACKED) { switch (type_num) { case Type::INT32: diff --git a/cpp/src/parquet/encoding_alp_benchmark.cc b/cpp/src/parquet/encoding_alp_benchmark.cc new file mode 100644 index 00000000000..7bea2a64914 --- /dev/null +++ b/cpp/src/parquet/encoding_alp_benchmark.cc @@ -0,0 +1,1824 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/buffer.h" +#include "arrow/util/alp/AlpWrapper.h" +#include "arrow/util/compression.h" +#include "parquet/encoding.h" +#include "parquet/schema.h" +#include "parquet/types.h" + +// This file benchmarks multiple encoding schemes for floating point values in +// Parquet. Structure mirrors Snowflake's FloatComprBenchmark.cpp +// +// It evaluates: +// 1) Compression Ratio +// 2) Encoding Speed +// 3) Decoding Speed +// +// Encoding schemes: +// 1) ALP encoding +// 2) ByteStreamSplit encoding +// 3) ZSTD compression +// +// On synthetic datasets: +// 1) Constant Value +// 2) Increasing values +// 3) Small Range decimal +// 4) Range decimal +// 5) Large Range decimal +// 6) Random values +// +// And real-world datasets: +// 1) floatingpoint_spotify1.csv (9 columns) +// 2) floatingpoint_spotify2.csv (9 columns) +// 3) floatingpoint_citytemperature.csv (1 column) +// 4) floatingpoint_poi.csv (2 columns) +// 5) floatingpoint_birdmigration.csv (1 column) +// 6) floatingpoint_commongovernment.csv (3 columns) +// 7) floatingpoint_arade.csv (4 columns) +// 8) floatingpoint_num_brain.csv (1 column) +// 9) floatingpoint_num_comet.csv (1 column) +// 10) floatingpoint_num_control.csv (1 column) +// 11) floatingpoint_num_plasma.csv (1 column) +// 12) floatingpoint_obs_error.csv (1 column) +// 13) floatingpoint_obs_info.csv (1 column) +// 14) floatingpoint_obs_spitzer.csv (1 column) +// 15) floatingpoint_obs_temp.csv (1 column) +// 16) floatingpoint_msg_bt.csv (1 column) +// 17) floatingpoint_msg_lu.csv (1 column) +// 18) floatingpoint_msg_sp.csv (1 column) +// 19) floatingpoint_msg_sppm.csv (1 column) +// 20) floatingpoint_msg_sweep3d.csv (1 column) + +namespace parquet { + +using schema::PrimitiveNode; + +// Helper function matching Snowflake's pow10 +constexpr uint64_t Pow10(uint64_t exp) { + uint64_t result = 1; + for (uint64_t i 
= 0; i < exp; ++i) { + result *= 10; + } + return result; +} + +// Encoding type enum (matching Snowflake's ComprEngine pattern) +enum class EncodingType { + kALP, + kByteStreamSplit, + kZSTD, +}; + +// Helper to create column descriptor for float/double +template +std::shared_ptr MakeColumnDescriptor() { + auto node = PrimitiveNode::Make("column", Repetition::REQUIRED, DType::type_num); + return std::make_shared(node, false, false); +} + +// ============================================================================ +// Benchmark data base class +// ============================================================================ + +/// \brief Helper class to set up encoding benchmark data. +/// +/// Matches Snowflake's RealComprBenchmarkData structure with encoding parameter. +template +struct RealComprBenchmarkData { + std::vector input_uncompressed; + std::shared_ptr encoded_data; + std::vector output_uncompressed; + uint64_t encoded_size = 0; + Encoding::type current_encoding; + std::unique_ptr<::arrow::util::Codec> codec; // For ZSTD + + virtual ~RealComprBenchmarkData() = default; + + void PrepareBenchmarkData(uint64_t element_count, EncodingType encoding_type) { + FillUncompressedInput(element_count); + + using DType = + typename std::conditional::value, FloatType, + DoubleType>::type; + auto descr = MakeColumnDescriptor(); + + // Select encoding based on type + switch (encoding_type) { + case EncodingType::kALP: + current_encoding = Encoding::ALP; + break; + case EncodingType::kByteStreamSplit: + current_encoding = Encoding::BYTE_STREAM_SPLIT; + codec = ::arrow::util::Codec::Create(::arrow::Compression::ZSTD).ValueOrDie(); + break; + case EncodingType::kZSTD: + // ZSTD uses PLAIN encoding + compression + current_encoding = Encoding::PLAIN; + codec = ::arrow::util::Codec::Create(::arrow::Compression::ZSTD).ValueOrDie(); + break; + } + + // Do initial encoding to size buffers + if (encoding_type == EncodingType::kALP) { + auto encoder = 
MakeTypedEncoder(Encoding::ALP, false, descr.get()); + encoder->Put(input_uncompressed.data(), + static_cast(input_uncompressed.size())); + encoded_data = encoder->FlushValues(); + encoded_size = encoded_data->size(); + } else if (encoding_type == EncodingType::kZSTD) { + // For ZSTD: Plain encode then compress + auto encoder = MakeTypedEncoder(Encoding::PLAIN, false, descr.get()); + encoder->Put(input_uncompressed.data(), + static_cast(input_uncompressed.size())); + auto plain_data = encoder->FlushValues(); + + // Compress with ZSTD - use AllocateBuffer to properly manage memory + int64_t max_compressed_len = + codec->MaxCompressedLen(plain_data->size(), plain_data->data()); + auto compressed_buffer = + ::arrow::AllocateResizableBuffer(max_compressed_len).ValueOrDie(); + int64_t actual_size = + codec + ->Compress(plain_data->size(), plain_data->data(), max_compressed_len, + compressed_buffer->mutable_data()) + .ValueOrDie(); + // Resize to actual compressed size and move to shared_ptr + (void)compressed_buffer->Resize(actual_size); // Resize can't fail for shrinking + encoded_data = std::shared_ptr(std::move(compressed_buffer)); + encoded_size = actual_size; + } else { + // For ByteStreamSplit: Direct encoding + auto encoder = MakeTypedEncoder(current_encoding, false, descr.get()); + encoder->Put(input_uncompressed.data(), + static_cast(input_uncompressed.size())); + auto byte_stream_split_data = encoder->FlushValues(); + // Compress with ZSTD - use AllocateBuffer to properly manage memory + int64_t max_compressed_len = codec->MaxCompressedLen( + byte_stream_split_data->size(), byte_stream_split_data->data()); + auto compressed_buffer = + ::arrow::AllocateResizableBuffer(max_compressed_len).ValueOrDie(); + int64_t actual_size = + codec + ->Compress(byte_stream_split_data->size(), byte_stream_split_data->data(), + max_compressed_len, compressed_buffer->mutable_data()) + .ValueOrDie(); + // Resize to actual compressed size and move to shared_ptr + 
(void)compressed_buffer->Resize(actual_size); // Resize can't fail for shrinking + encoded_data = std::shared_ptr(std::move(compressed_buffer)); + encoded_size = actual_size; + } + + // Prepare output buffer + output_uncompressed.resize(input_uncompressed.size()); + } + + virtual void FillUncompressedInput(uint64_t element_count) = 0; +}; + +// ============================================================================ +// Synthetic Data Generators +// ============================================================================ + +template +struct ConstantValues : public RealComprBenchmarkData { + void FillUncompressedInput(uint64_t element_count) override { + const T value = static_cast(1.1); + this->input_uncompressed = std::vector(element_count, value); + } +}; + +template +struct IncreasingValues : public RealComprBenchmarkData { + void FillUncompressedInput(uint64_t element_count) override { + this->input_uncompressed.resize(element_count); + T current_value = 0.0; + for (uint64_t i = 0; i < element_count; i++) { + this->input_uncompressed[i] = current_value; + current_value += 1.0; + } + } +}; + +template +struct DecimalSmallRange : public RealComprBenchmarkData { + void FillUncompressedInput(uint64_t element_count) override { + this->input_uncompressed.resize(element_count); + const uint64_t min_val = 100; + const uint64_t max_val = 1000; + const uint64_t decimal_places = 2; + const uint64_t mult = Pow10(decimal_places); + + std::uniform_int_distribution unif(min_val * mult, max_val * mult); + std::default_random_engine re; + for (uint64_t i = 0; i < element_count; i++) { + this->input_uncompressed[i] = unif(re) * 1.0 / mult; + } + } +}; + +template +struct DecimalRange : public RealComprBenchmarkData { + void FillUncompressedInput(uint64_t element_count) override { + this->input_uncompressed.resize(element_count); + const uint64_t min_val = 1000; + const uint64_t max_val = 100000; + const uint64_t decimal_places = 6; + const uint64_t mult = 
Pow10(decimal_places); + + std::uniform_int_distribution unif(min_val * mult, max_val * mult); + std::default_random_engine re; + for (uint64_t i = 0; i < element_count; i++) { + this->input_uncompressed[i] = unif(re) * 1.0 / mult; + } + } +}; + +template +struct DecimalLargeRange : public RealComprBenchmarkData { + void FillUncompressedInput(uint64_t element_count) override { + this->input_uncompressed.resize(element_count); + const uint64_t min_val = 1000; + const uint64_t max_val = 1000000; + const uint64_t decimal_places = 6; + const uint64_t mult = Pow10(decimal_places); + + std::uniform_int_distribution unif(min_val * mult, max_val * mult); + std::default_random_engine re; + for (uint64_t i = 0; i < element_count; i++) { + this->input_uncompressed[i] = unif(re) * 1.0 / mult; + } + } +}; + +template +struct RandomValues : public RealComprBenchmarkData { + void FillUncompressedInput(uint64_t element_count) override { + this->input_uncompressed.resize(element_count); + std::uniform_real_distribution unif(std::numeric_limits::min(), + std::numeric_limits::max()); + std::default_random_engine re; + for (uint64_t i = 0; i < element_count; i++) { + this->input_uncompressed[i] = unif(re); + } + } +}; + +// ============================================================================ +// CSV Loading Infrastructure (for real-world datasets) +// ============================================================================ + +// Extract tarball once and return the data directory path +std::string GetDataDirectory() { + static std::string data_dir; + static bool initialized = false; + + if (!initialized) { + // Find the tarball location relative to this source file + std::string tarball_path = std::string(__FILE__); + tarball_path = tarball_path.substr(0, tarball_path.find_last_of("/\\")); + tarball_path = tarball_path.substr(0, tarball_path.find_last_of("/\\")); + + tarball_path += "/../submodules/parquet-testing/data/floatingpoint_data.tar.gz"; + + // Use a fixed 
extraction directory that can be reused across runs + data_dir = "/tmp/parquet_alp_benchmark_data"; + + // Check if tarball exists + std::ifstream tarball_check(tarball_path); + if (!tarball_check.good()) { + // Fall back to original directory if tarball not found + data_dir = std::string(__FILE__); + data_dir = data_dir.substr(0, data_dir.find_last_of("/\\")); + data_dir = data_dir.substr(0, data_dir.find_last_of("/\\")); + data_dir += "/../submodules/parquet-testing/data"; + initialized = true; + return data_dir; + } + + // Check if extraction directory already exists and has files + std::ifstream check_file(data_dir + "/floatingpoint_spotify1.csv"); + if (check_file.good()) { + // Directory already exists with data, reuse it + initialized = true; + return data_dir; + } + + // Create extraction directory and extract tarball + std::string mkdir_cmd = "mkdir -p " + data_dir; + std::string extract_cmd = "tar -xzf " + tarball_path + " -C " + data_dir; + + if (system(mkdir_cmd.c_str()) == 0 && system(extract_cmd.c_str()) == 0) { + initialized = true; + } else { + // Extraction failed, fall back to original directory + data_dir = std::string(__FILE__); + data_dir = data_dir.substr(0, data_dir.find_last_of("/\\")); + data_dir = data_dir.substr(0, data_dir.find_last_of("/\\")); + data_dir += "/../submodules/parquet-testing/data"; + initialized = true; + } + } + + return data_dir; +} + +std::vector SplitCsvRow(const std::string& line, char delimiter = ',') { + std::vector columns; + std::istringstream stream(line); + std::string cell; + + while (std::getline(stream, cell, delimiter)) { + columns.push_back(cell); + } + return columns; +} + +std::vector LoadSpotifyColumn(const std::string& column_name, + const std::string& filename) { + std::vector values; + + static const std::unordered_set kValidFloatColumns = { + "danceability", "energy", "loudness", "speechiness", "acousticness", + "instrumentalness", "liveness", "valence", "tempo"}; + + if 
(kValidFloatColumns.find(column_name) == kValidFloatColumns.end()) { + std::cerr << "Column '" << column_name << "' is not a supported double column" + << std::endl; + return values; + } + + std::string file_path = GetDataDirectory() + "/" + filename; + + std::ifstream file(file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_path << std::endl; + return values; + } + + std::string file_content((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); + file.close(); + + std::istringstream ss(file_content); + std::string line; + size_t column_index = SIZE_MAX; + + if (std::getline(ss, line)) { + std::istringstream header_stream(line); + std::string header; + size_t index = 0; + + while (std::getline(header_stream, header, ',')) { + header.erase(0, header.find_first_not_of(" \t\r\n")); + header.erase(header.find_last_not_of(" \t\r\n") + 1); + + if (header == column_name) { + column_index = index; + break; + } + index++; + } + } + + if (column_index == SIZE_MAX) { + std::cerr << "Column '" << column_name << "' not found in header" << std::endl; + return values; + } + + while (std::getline(ss, line)) { + std::vector columns = SplitCsvRow(line); + if (column_index < columns.size()) { + try { + double value = std::stod(columns[column_index]); + values.push_back(value); + } catch (const std::exception& e) { + // Skip invalid values silently + } + } + } + + return values; +} + +// ============================================================================ +// Real-World Dataset Classes +// ============================================================================ + +template +struct SpotifyData : public RealComprBenchmarkData { + std::string column_name; + + explicit SpotifyData(const std::string& column) : column_name(column) {} + + void FillUncompressedInput(uint64_t /*element_count*/) override { + std::vector spotify_values = + LoadSpotifyColumn(column_name, "floatingpoint_spotify1.csv"); + + 
this->input_uncompressed.resize(spotify_values.size()); + for (size_t i = 0; i < spotify_values.size(); ++i) { + this->input_uncompressed[i] = static_cast(spotify_values[i]); + } + } +}; + +template +struct SpotifyData2 : public RealComprBenchmarkData { + std::string column_name; + + explicit SpotifyData2(const std::string& column) : column_name(column) {} + + void FillUncompressedInput(uint64_t /*element_count*/) override { + std::vector spotify_values = + LoadSpotifyColumn(column_name, "floatingpoint_spotify2.csv"); + + this->input_uncompressed.resize(spotify_values.size()); + for (size_t i = 0; i < spotify_values.size(); ++i) { + this->input_uncompressed[i] = static_cast(spotify_values[i]); + } + } +}; + +// Load AvgTemperature column from City Temperature CSV data +std::vector LoadCityTemperatureColumn() { + std::vector values; + + std::string file_path = GetDataDirectory() + "/floatingpoint_citytemperature.csv"; + + std::ifstream file(file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_path << std::endl; + return values; + } + + std::string line; + // Skip header line + if (std::getline(file, line)) { + // Process data lines - each line is a single temperature value + while (std::getline(file, line)) { + try { + double value = std::stod(line); + values.push_back(value); + } catch (const std::exception& e) { + // Skip invalid values + continue; + } + } + } + file.close(); + + return values; +} + +// Load any double-point column from POI CSV data +std::vector LoadPoiColumn(const std::string& column_name) { + std::vector values; + + static const std::unordered_set kValidFloatColumns = {"latitude_radian", + "longitude_radian"}; + + if (kValidFloatColumns.find(column_name) == kValidFloatColumns.end()) { + std::cerr << "Column '" << column_name << "' is not a supported double column" + << std::endl; + return values; + } + + std::string file_path = GetDataDirectory() + "/floatingpoint_poi.csv"; + + std::ifstream file(file_path); + if 
(!file.is_open()) { + std::cerr << "Failed to open file: " << file_path << std::endl; + return values; + } + + std::string line; + // Read header line to find column index + if (!std::getline(file, line)) { + std::cerr << "Failed to read header from POI CSV" << std::endl; + return values; + } + + std::vector headers = SplitCsvRow(line); + int column_index = -1; + for (size_t i = 0; i < headers.size(); ++i) { + std::string trimmed_header = headers[i]; + trimmed_header.erase(0, trimmed_header.find_first_not_of(" \t\r\n")); + trimmed_header.erase(trimmed_header.find_last_not_of(" \t\r\n") + 1); + + if (trimmed_header == column_name) { + column_index = static_cast(i); + break; + } + } + + if (column_index == -1) { + std::cerr << "Column '" << column_name << "' not found in POI CSV header" + << std::endl; + return values; + } + + // Process data lines + while (std::getline(file, line)) { + std::vector columns = SplitCsvRow(line); + if (columns.size() > static_cast(column_index)) { + try { + double value = std::stod(columns[column_index]); + values.push_back(value); + } catch (const std::exception& e) { + continue; + } + } + } + file.close(); + + return values; +} + +// Load Bird Migration data +std::vector LoadBirdMigrationData() { + std::vector values; + + std::string file_path = GetDataDirectory() + "/floatingpoint_birdmigration.csv"; + + std::ifstream file(file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_path << std::endl; + return values; + } + + std::string line; + // Skip header line + if (!std::getline(file, line)) { + std::cerr << "Failed to read header from bird-migration CSV" << std::endl; + return values; + } + + while (std::getline(file, line)) { + try { + double value = std::stod(line); + values.push_back(value); + } catch (const std::exception& e) { + continue; + } + } + file.close(); + + return values; +} + +// Load Common Government column +std::vector LoadCommonGovernmentColumn(const std::string& column_name) { + 
std::vector values; + + static const std::unordered_set kValidFloatColumns = {"amount1", "amount2", + "amount3"}; + + if (kValidFloatColumns.find(column_name) == kValidFloatColumns.end()) { + std::cerr << "Column '" << column_name << "' is not a supported double column" + << std::endl; + return values; + } + + size_t column_index = SIZE_MAX; + if (column_name == "amount1") + column_index = 0; + else if (column_name == "amount2") + column_index = 1; + else if (column_name == "amount3") + column_index = 2; + + std::string file_path = GetDataDirectory() + "/floatingpoint_commongovernment.csv"; + + std::ifstream file(file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_path << std::endl; + return values; + } + + std::string line; + while (std::getline(file, line)) { + std::vector columns = SplitCsvRow(line, '|'); + if (column_index < columns.size()) { + try { + double value = std::stod(columns[column_index]); + values.push_back(value); + } catch (const std::exception& e) { + // Skip invalid values + } + } + } + file.close(); + + return values; +} + +// Load Arade column +std::vector LoadAradeColumn(const std::string& column_name) { + std::vector values; + + static const std::unordered_set kValidFloatColumns = {"value1", "value2", + "value3", "value4"}; + + if (kValidFloatColumns.find(column_name) == kValidFloatColumns.end()) { + std::cerr << "Column '" << column_name << "' is not a supported double column" + << std::endl; + return values; + } + + size_t column_index = SIZE_MAX; + if (column_name == "value1") + column_index = 0; + else if (column_name == "value2") + column_index = 1; + else if (column_name == "value3") + column_index = 2; + else if (column_name == "value4") + column_index = 3; + + std::string file_path = GetDataDirectory() + "/floatingpoint_arade.csv"; + + std::ifstream file(file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_path << std::endl; + return values; + } + + std::string line; + 
while (std::getline(file, line)) { + std::vector columns = SplitCsvRow(line, '|'); + if (column_index < columns.size()) { + try { + double value = std::stod(columns[column_index]); + values.push_back(value); + } catch (const std::exception& e) { + // Skip invalid values + } + } + } + file.close(); + + return values; +} + +// Generic loader for single-column FPC-format CSV files (with header) +std::vector LoadSingleColumnFpcData(const std::string& dataset_name) { + std::vector values; + + std::string file_path = GetDataDirectory() + "/floatingpoint_" + dataset_name + ".csv"; + + std::ifstream file(file_path); + if (!file.is_open()) { + std::cerr << "Failed to open file: " << file_path << std::endl; + return values; + } + + std::string line; + // Skip header line + if (!std::getline(file, line)) { + std::cerr << "Failed to read header from " << dataset_name << " CSV" << std::endl; + return values; + } + + while (std::getline(file, line)) { + try { + double value = std::stod(line); + values.push_back(value); + } catch (const std::exception& e) { + continue; + } + } + file.close(); + + return values; +} + +// Individual loaders for FPC datasets +std::vector LoadNumBrainData() { return LoadSingleColumnFpcData("num_brain"); } +std::vector LoadNumCometData() { return LoadSingleColumnFpcData("num_comet"); } +std::vector LoadNumControlData() { + return LoadSingleColumnFpcData("num_control"); +} +std::vector LoadNumPlasmaData() { return LoadSingleColumnFpcData("num_plasma"); } +std::vector LoadObsErrorData() { return LoadSingleColumnFpcData("obs_error"); } +std::vector LoadObsInfoData() { return LoadSingleColumnFpcData("obs_info"); } +std::vector LoadObsSpitzerData() { + return LoadSingleColumnFpcData("obs_spitzer"); +} +std::vector LoadObsTempData() { return LoadSingleColumnFpcData("obs_temp"); } +std::vector LoadMsgBtData() { return LoadSingleColumnFpcData("msg_bt"); } +std::vector LoadMsgLuData() { return LoadSingleColumnFpcData("msg_lu"); } +std::vector LoadMsgSpData() { 
return LoadSingleColumnFpcData("msg_sp"); } +std::vector LoadMsgSppmData() { return LoadSingleColumnFpcData("msg_sppm"); } +std::vector LoadMsgSweep3dData() { + return LoadSingleColumnFpcData("msg_sweep3d"); +} + +// Data classes for all additional datasets +template +struct CityTemperatureData : public RealComprBenchmarkData { + CityTemperatureData() = default; + + void FillUncompressedInput(uint64_t /*element_count*/) override { + std::vector values = LoadCityTemperatureColumn(); + this->input_uncompressed.resize(values.size()); + for (size_t i = 0; i < values.size(); ++i) { + this->input_uncompressed[i] = static_cast(values[i]); + } + } +}; + +template +struct PoiData : public RealComprBenchmarkData { + std::string column_name; + + explicit PoiData(const std::string& column) : column_name(column) {} + + void FillUncompressedInput(uint64_t /*element_count*/) override { + std::vector values = LoadPoiColumn(column_name); + this->input_uncompressed.resize(values.size()); + for (size_t i = 0; i < values.size(); ++i) { + this->input_uncompressed[i] = static_cast(values[i]); + } + } +}; + +template +struct BirdMigrationData : public RealComprBenchmarkData { + explicit BirdMigrationData() {} + + void FillUncompressedInput(uint64_t /*element_count*/) override { + std::vector values = LoadBirdMigrationData(); + this->input_uncompressed.resize(values.size()); + for (size_t i = 0; i < values.size(); ++i) { + this->input_uncompressed[i] = static_cast(values[i]); + } + } +}; + +template +struct CommonGovernmentData : public RealComprBenchmarkData { + std::string column_name; + + explicit CommonGovernmentData(const std::string& column) : column_name(column) {} + + void FillUncompressedInput(uint64_t /*element_count*/) override { + std::vector values = LoadCommonGovernmentColumn(column_name); + this->input_uncompressed.resize(values.size()); + for (size_t i = 0; i < values.size(); ++i) { + this->input_uncompressed[i] = static_cast(values[i]); + } + } +}; + +template +struct 
AradeData : public RealComprBenchmarkData { + std::string column_name; + + explicit AradeData(const std::string& column) : column_name(column) {} + + void FillUncompressedInput(uint64_t /*element_count*/) override { + std::vector values = LoadAradeColumn(column_name); + this->input_uncompressed.resize(values.size()); + for (size_t i = 0; i < values.size(); ++i) { + this->input_uncompressed[i] = static_cast(values[i]); + } + } +}; + +// Generic template for FPC single-column datasets +template (*LoaderFunc)()> +struct FpcDataset : public RealComprBenchmarkData { + explicit FpcDataset() {} + + void FillUncompressedInput(uint64_t /*element_count*/) override { + std::vector values = LoaderFunc(); + this->input_uncompressed.resize(values.size()); + for (size_t i = 0; i < values.size(); ++i) { + this->input_uncompressed[i] = static_cast(values[i]); + } + } +}; + +// Type aliases for each FPC dataset +template +using NumBrainData = FpcDataset; +template +using NumCometData = FpcDataset; +template +using NumControlData = FpcDataset; +template +using NumPlasmaData = FpcDataset; +template +using ObsErrorData = FpcDataset; +template +using ObsInfoData = FpcDataset; +template +using ObsSpitzerData = FpcDataset; +template +using ObsTempData = FpcDataset; +template +using MsgBtData = FpcDataset; +template +using MsgLuData = FpcDataset; +template +using MsgSpData = FpcDataset; +template +using MsgSppmData = FpcDataset; +template +using MsgSweep3dData = FpcDataset; + +// ============================================================================ +// Benchmark Fixture (matching Snowflake's DoubleBenchmark structure) +// ============================================================================ + +template +class DoubleBenchmark : public benchmark::Fixture { + public: + static constexpr uint64_t kElementCount = 50000; // Matches Snowflake exactly + + void Setup(std::unique_ptr> bd, uint64_t element_count, + EncodingType encoding_type) { + encoding_type_ = encoding_type; + bd_ = 
std::move(bd); + bd_->PrepareBenchmarkData(element_count, encoding_type); + } + + void VerifyDataCompress() { + Decompress(); + if (memcmp(bd_->input_uncompressed.data(), bd_->output_uncompressed.data(), + bd_->input_uncompressed.size() * sizeof(T)) != 0) { + std::cerr << "verificationFailed" << std::endl; + } + } + + void VerifyDataDecompress() { + if (memcmp(bd_->input_uncompressed.data(), bd_->output_uncompressed.data(), + bd_->input_uncompressed.size() * sizeof(T)) != 0) { + std::cerr << "verificationFailed" << std::endl; + } + } + + void Compress() { + using DType = + typename std::conditional::value, FloatType, + DoubleType>::type; + auto descr = MakeColumnDescriptor(); + + if (encoding_type_ == EncodingType::kALP) { + auto encoder = MakeTypedEncoder(Encoding::ALP, false, descr.get()); + encoder->Put(bd_->input_uncompressed.data(), + static_cast(bd_->input_uncompressed.size())); + bd_->encoded_data = encoder->FlushValues(); + bd_->encoded_size = bd_->encoded_data->size(); + } else if (encoding_type_ == EncodingType::kZSTD) { + // For ZSTD: Plain encode then compress + auto encoder = MakeTypedEncoder(Encoding::PLAIN, false, descr.get()); + encoder->Put(bd_->input_uncompressed.data(), + static_cast(bd_->input_uncompressed.size())); + auto plain_data = encoder->FlushValues(); + + // Compress with ZSTD - use AllocateBuffer to properly manage memory + int64_t max_compressed_len = + bd_->codec->MaxCompressedLen(plain_data->size(), plain_data->data()); + auto compressed_buffer = + ::arrow::AllocateResizableBuffer(max_compressed_len).ValueOrDie(); + int64_t actual_size = + bd_->codec + ->Compress(plain_data->size(), plain_data->data(), max_compressed_len, + compressed_buffer->mutable_data()) + .ValueOrDie(); + // Resize to actual compressed size and move to shared_ptr + (void)compressed_buffer->Resize(actual_size); // Resize can't fail for shrinking + bd_->encoded_data = std::shared_ptr(std::move(compressed_buffer)); + bd_->encoded_size = actual_size; + } else { + // 
For ByteStreamSplit: Direct encoding + auto encoder = MakeTypedEncoder(bd_->current_encoding, false, descr.get()); + encoder->Put(bd_->input_uncompressed.data(), + static_cast(bd_->input_uncompressed.size())); + auto byte_stream_split_data = encoder->FlushValues(); + // Compress with ZSTD - use AllocateBuffer to properly manage memory + int64_t max_compressed_len = bd_->codec->MaxCompressedLen( + byte_stream_split_data->size(), byte_stream_split_data->data()); + auto compressed_buffer = + ::arrow::AllocateResizableBuffer(max_compressed_len).ValueOrDie(); + int64_t actual_size = + bd_->codec + ->Compress(byte_stream_split_data->size(), byte_stream_split_data->data(), + max_compressed_len, compressed_buffer->mutable_data()) + .ValueOrDie(); + // Resize to actual compressed size and move to shared_ptr + (void)compressed_buffer->Resize(actual_size); // Resize can't fail for shrinking + bd_->encoded_data = std::shared_ptr(std::move(compressed_buffer)); + bd_->encoded_size = actual_size; + } + } + + void Decompress() { + using DType = + typename std::conditional::value, FloatType, + DoubleType>::type; + auto descr = MakeColumnDescriptor(); + + if (encoding_type_ == EncodingType::kALP) { + // For ALP: Use Parquet decoder + auto decoder = MakeTypedDecoder(Encoding::ALP, descr.get()); + decoder->SetData(static_cast(bd_->input_uncompressed.size()), + bd_->encoded_data->data(), + static_cast(bd_->encoded_data->size())); + decoder->Decode(bd_->output_uncompressed.data(), + static_cast(bd_->output_uncompressed.size())); + } else if (encoding_type_ == EncodingType::kZSTD) { + // For ZSTD: Decompress then plain decode + int64_t decompressed_len = bd_->input_uncompressed.size() * sizeof(T); + std::vector decompressed(decompressed_len); + int64_t actual_size = + bd_->codec + ->Decompress(bd_->encoded_data->size(), bd_->encoded_data->data(), + decompressed_len, decompressed.data()) + .ValueOrDie(); + + // Plain decode + auto decoder = MakeTypedDecoder(Encoding::PLAIN, descr.get()); 
+ decoder->SetData(static_cast(bd_->input_uncompressed.size()), + decompressed.data(), static_cast(actual_size)); + decoder->Decode(bd_->output_uncompressed.data(), + static_cast(bd_->output_uncompressed.size())); + } else { + int64_t decompressed_len = bd_->input_uncompressed.size() * sizeof(T); + std::vector decompressed(decompressed_len); + int64_t actual_size = + bd_->codec + ->Decompress(bd_->encoded_data->size(), bd_->encoded_data->data(), + decompressed_len, decompressed.data()) + .ValueOrDie(); + + // For ByteStreamSplit: Direct decoding + auto decoder = MakeTypedDecoder(bd_->current_encoding, descr.get()); + decoder->SetData(static_cast(bd_->input_uncompressed.size()), + decompressed.data(), static_cast(actual_size)); + decoder->Decode(bd_->output_uncompressed.data(), + static_cast(bd_->output_uncompressed.size())); + } + } + + void BenchmarkCompress(benchmark::State& state, + std::unique_ptr> bd, + EncodingType encoding_type) { + Setup(std::move(bd), kElementCount, encoding_type); + + uint64_t iteration_count = 0; + auto start = std::chrono::high_resolution_clock::now(); + for (auto _ : state) { + Compress(); + iteration_count++; + } + auto end = std::chrono::high_resolution_clock::now(); + const uint64_t overall_time_us = + std::chrono::duration_cast(end - start).count(); + + state.counters["MB/s"] = + static_cast(bd_->input_uncompressed.size() * sizeof(T) * + iteration_count) / + (overall_time_us); + + VerifyDataCompress(); + state.counters["Compression Ratio Percent"] = + 0.64 * + (100 * bd_->encoded_size / (1.0 * bd_->input_uncompressed.size() * sizeof(T))); + } + + void BenchmarkDecompress(benchmark::State& state, + std::unique_ptr> bd, + EncodingType encoding_type) { + Setup(std::move(bd), kElementCount, encoding_type); + + uint64_t iteration_count = 0; + auto start = std::chrono::high_resolution_clock::now(); + for (auto _ : state) { + Decompress(); + iteration_count++; + } + auto end = std::chrono::high_resolution_clock::now(); + const uint64_t 
overall_time_us = + std::chrono::duration_cast(end - start).count(); + + state.counters["MB/s"] = + static_cast(bd_->input_uncompressed.size() * sizeof(T) * + iteration_count) / + (overall_time_us); + + VerifyDataDecompress(); + } + + std::unique_ptr> bd_; + EncodingType encoding_type_; +}; + +// ============================================================================ +// Column Lists (matching Snowflake's pattern) +// ============================================================================ + +#define COLUMN_LIST \ + X(Valence, "valence") \ + X(Acousticness, "acousticness") \ + X(Danceability, "danceability") \ + X(Energy, "energy") \ + X(Instrumentalness, "instrumentalness")\ + X(Liveness, "liveness") \ + X(Loudness, "loudness") \ + X(Tempo, "tempo") \ + X(Speechiness, "speechiness") + +// For new dataset (Spotify2), we need lowercase identifiers +#define COLUMN_LIST_NEW \ + X(valence) \ + X(acousticness) \ + X(danceability) \ + X(energy) \ + X(instrumentalness) \ + X(liveness) \ + X(loudness) \ + X(tempo) \ + X(speechiness) + +// POI dataset columns +#define POI_COLUMN_LIST \ + X(LatitudeRadian, "latitude_radian") \ + X(LongitudeRadian, "longitude_radian") + +// Common Government dataset columns +#define COMMON_GOVERNMENT_COLUMN_LIST \ + X(Amount1, "amount1") \ + X(Amount2, "amount2") \ + X(Amount3, "amount3") + +// Arade dataset columns +#define ARADE_COLUMN_LIST \ + X(Value1, "value1") \ + X(Value2, "value2") \ + X(Value3, "value3") \ + X(Value4, "value4") + +// Algorithm list for all benchmarks (matching Snowflake's pattern) +#define ALGORITHM_LIST \ + X(ALP, kALP) \ + X(BYTESTREAMSPLIT, kByteStreamSplit) \ + X(ZSTD, kZSTD) + +// ============================================================================ +// Benchmark Generation Macros (matching Snowflake's pattern) +// ============================================================================ + +// Synthetic data benchmark macros +#define BENCHMARK_SYNTHETIC_COMPRESS(ALGO, NAME, CLASS, ENGINE) \ + 
BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##NAME##Float, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_SYNTHETIC_DECOMPRESS(ALGO, NAME, CLASS, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##NAME##Float, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +// Original Spotify dataset (Dataset 1) benchmark macros +#define BENCHMARK_ORIGINAL_DATASET_COMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##Spotify##COLUMN_CAP##Float, \ + double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(COLUMN_LOWER)), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_ORIGINAL_DATASET_DECOMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, \ + ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, \ + ALGO##decompress##Spotify##COLUMN_CAP##Float, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(COLUMN_LOWER)), \ + EncodingType::ENGINE); \ + } + +// New Spotify dataset (Dataset 2) benchmark macros +#define BENCHMARK_NEW_DATASET_COMPRESS(ALGO, COLUMN, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##Spotify##COLUMN##2Float, \ + double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(#COLUMN)), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_NEW_DATASET_DECOMPRESS(ALGO, COLUMN, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##Spotify##COLUMN##2Float, \ + double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(#COLUMN)), \ + EncodingType::ENGINE); \ + } + +// City Temperature dataset 
benchmark macros +#define BENCHMARK_CITY_TEMP_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##CityTemperatureFloat, \ + double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_CITY_TEMP_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##CityTemperatureFloat, \ + double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +// POI dataset benchmark macros +#define BENCHMARK_POI_COMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##Poi##COLUMN_CAP##Float, \ + double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(COLUMN_LOWER)), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_POI_DECOMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##Poi##COLUMN_CAP##Float, \ + double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(COLUMN_LOWER)), \ + EncodingType::ENGINE); \ + } + +// Bird Migration dataset benchmark macros +#define BENCHMARK_BIRD_MIGRATION_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##BirdMigrationFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_BIRD_MIGRATION_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##BirdMigrationFloat, \ + double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +// Common Government dataset benchmark macros 
+#define BENCHMARK_COMMON_GOVERNMENT_COMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, \ + ALGO##compress##CommonGovernment##COLUMN_CAP##Float, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(COLUMN_LOWER)), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_COMMON_GOVERNMENT_DECOMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, \ + ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, \ + ALGO##decompress##CommonGovernment##COLUMN_CAP##Float, \ + double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(COLUMN_LOWER)), \ + EncodingType::ENGINE); \ + } + +// Arade dataset benchmark macros +#define BENCHMARK_ARADE_COMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##Arade##COLUMN_CAP##Float, \ + double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(COLUMN_LOWER)), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_ARADE_DECOMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##Arade##COLUMN_CAP##Float, \ + double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>(COLUMN_LOWER)), \ + EncodingType::ENGINE); \ + } + +// FPC dataset benchmark macros (generic for single-column datasets) +#define BENCHMARK_NUM_BRAIN_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##NumBrainFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_NUM_BRAIN_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##NumBrainFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + 
std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_NUM_COMET_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##NumCometFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_NUM_COMET_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##NumCometFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_NUM_CONTROL_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##NumControlFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_NUM_CONTROL_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##NumControlFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_NUM_PLASMA_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##NumPlasmaFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_NUM_PLASMA_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##NumPlasmaFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_OBS_ERROR_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##ObsErrorFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + 
state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_OBS_ERROR_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##ObsErrorFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_OBS_INFO_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##ObsInfoFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_OBS_INFO_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##ObsInfoFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_OBS_SPITZER_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##ObsSpitzerFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_OBS_SPITZER_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##ObsSpitzerFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_OBS_TEMP_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##ObsTempFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_OBS_TEMP_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##ObsTempFloat, double) \ + (benchmark::State & state) { \ + 
BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_BT_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##MsgBtFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_BT_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##MsgBtFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_LU_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##MsgLuFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_LU_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##MsgLuFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_SP_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##MsgSpFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_SP_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##MsgSpFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_SPPM_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##MsgSppmFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + 
std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_SPPM_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##MsgSppmFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_SWEEP3D_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##compress##MsgSweep3dFloat, double) \ + (benchmark::State & state) { \ + BenchmarkCompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +#define BENCHMARK_MSG_SWEEP3D_DECOMPRESS(ALGO, ENGINE) \ + BENCHMARK_TEMPLATE_F(DoubleBenchmark, ALGO##decompress##MsgSweep3dFloat, double) \ + (benchmark::State & state) { \ + BenchmarkDecompress( \ + state, \ + std::unique_ptr>( \ + std::make_unique>()), \ + EncodingType::ENGINE); \ + } + +// ============================================================================ +// Benchmark Registrations - Synthetic Data (All Algorithms) +// COMMENTED OUT - Using only real-world Spotify data +// ============================================================================ + +#if 0 +#define GENERATE_SYNTHETIC_BENCHMARKS(ALGO, ENGINE) \ + BENCHMARK_SYNTHETIC_COMPRESS(ALGO, Constant, ConstantValues, ENGINE) \ + BENCHMARK_SYNTHETIC_DECOMPRESS(ALGO, Constant, ConstantValues, ENGINE) \ + BENCHMARK_SYNTHETIC_COMPRESS(ALGO, Increasing, IncreasingValues, ENGINE) \ + BENCHMARK_SYNTHETIC_DECOMPRESS(ALGO, Increasing, IncreasingValues, ENGINE) \ + BENCHMARK_SYNTHETIC_COMPRESS(ALGO, SmallRange, DecimalSmallRange, ENGINE) \ + BENCHMARK_SYNTHETIC_DECOMPRESS(ALGO, SmallRange, DecimalSmallRange, ENGINE) \ + BENCHMARK_SYNTHETIC_COMPRESS(ALGO, Range, DecimalRange, ENGINE) \ + BENCHMARK_SYNTHETIC_DECOMPRESS(ALGO, Range, DecimalRange, ENGINE) \ + BENCHMARK_SYNTHETIC_COMPRESS(ALGO, LargeRange, DecimalLargeRange, ENGINE) \ + 
BENCHMARK_SYNTHETIC_DECOMPRESS(ALGO, LargeRange, DecimalLargeRange, ENGINE) \ + BENCHMARK_SYNTHETIC_COMPRESS(ALGO, Random, RandomValues, ENGINE) \ + BENCHMARK_SYNTHETIC_DECOMPRESS(ALGO, Random, RandomValues, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_SYNTHETIC_BENCHMARKS(ALGO, ENGINE) +ALGORITHM_LIST +#undef X +#endif + +// ============================================================================ +// Benchmark Registrations - Spotify Dataset 1 (All Algorithms x 9 columns) +// ============================================================================ + +#define GENERATE_SPOTIFY_BENCHMARKS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_ORIGINAL_DATASET_COMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_ORIGINAL_DATASET_DECOMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) + +#define GENERATE_ALGORITHM_FOR_SPOTIFY(ALGO, ENGINE) \ + GENERATE_SPOTIFY_BENCHMARKS(ALGO, Valence, "valence", ENGINE) \ + GENERATE_SPOTIFY_BENCHMARKS(ALGO, Acousticness, "acousticness", ENGINE) \ + GENERATE_SPOTIFY_BENCHMARKS(ALGO, Danceability, "danceability", ENGINE) \ + GENERATE_SPOTIFY_BENCHMARKS(ALGO, Energy, "energy", ENGINE) \ + GENERATE_SPOTIFY_BENCHMARKS(ALGO, Instrumentalness, "instrumentalness", ENGINE) \ + GENERATE_SPOTIFY_BENCHMARKS(ALGO, Liveness, "liveness", ENGINE) \ + GENERATE_SPOTIFY_BENCHMARKS(ALGO, Loudness, "loudness", ENGINE) \ + GENERATE_SPOTIFY_BENCHMARKS(ALGO, Tempo, "tempo", ENGINE) \ + GENERATE_SPOTIFY_BENCHMARKS(ALGO, Speechiness, "speechiness", ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_SPOTIFY(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// ============================================================================ +// Benchmark Registrations - Spotify Dataset 2 (All Algorithms x 9 columns) +// ============================================================================ + +#define GENERATE_SPOTIFY2_BENCHMARKS(ALGO, COLUMN, ENGINE) \ + BENCHMARK_NEW_DATASET_COMPRESS(ALGO, COLUMN, ENGINE) \ + BENCHMARK_NEW_DATASET_DECOMPRESS(ALGO, 
COLUMN, ENGINE) + +#define GENERATE_ALGORITHM_FOR_SPOTIFY2(ALGO, ENGINE) \ + GENERATE_SPOTIFY2_BENCHMARKS(ALGO, valence, ENGINE) \ + GENERATE_SPOTIFY2_BENCHMARKS(ALGO, acousticness, ENGINE) \ + GENERATE_SPOTIFY2_BENCHMARKS(ALGO, danceability, ENGINE) \ + GENERATE_SPOTIFY2_BENCHMARKS(ALGO, energy, ENGINE) \ + GENERATE_SPOTIFY2_BENCHMARKS(ALGO, instrumentalness, ENGINE) \ + GENERATE_SPOTIFY2_BENCHMARKS(ALGO, liveness, ENGINE) \ + GENERATE_SPOTIFY2_BENCHMARKS(ALGO, loudness, ENGINE) \ + GENERATE_SPOTIFY2_BENCHMARKS(ALGO, tempo, ENGINE) \ + GENERATE_SPOTIFY2_BENCHMARKS(ALGO, speechiness, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_SPOTIFY2(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// ============================================================================ +// Benchmark Registrations - City Temperature Dataset (1 column x 3 algorithms) +// ============================================================================ + +#define GENERATE_ALGORITHM_FOR_CITY_TEMP(ALGO, ENGINE) \ + BENCHMARK_CITY_TEMP_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_CITY_TEMP_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_CITY_TEMP(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// ============================================================================ +// Benchmark Registrations - POI Dataset (2 columns x 3 algorithms) +// ============================================================================ + +#define GENERATE_ALGORITHM_FOR_POI(COLUMN_CAP, COLUMN_LOWER, ALGO, ENGINE) \ + BENCHMARK_POI_COMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_POI_DECOMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) + +#define GENERATE_ALGORITHMS_FOR_POI_COLUMN(COLUMN_CAP, COLUMN_LOWER) \ + GENERATE_ALGORITHM_FOR_POI(COLUMN_CAP, COLUMN_LOWER, ALP, kALP) \ + GENERATE_ALGORITHM_FOR_POI(COLUMN_CAP, COLUMN_LOWER, BYTESTREAMSPLIT, \ + kByteStreamSplit) \ + GENERATE_ALGORITHM_FOR_POI(COLUMN_CAP, COLUMN_LOWER, ZSTD, kZSTD) + +#define X(COLUMN_CAP, COLUMN_LOWER) \ + 
GENERATE_ALGORITHMS_FOR_POI_COLUMN(COLUMN_CAP, COLUMN_LOWER) +POI_COLUMN_LIST +#undef X + +// ============================================================================ +// Benchmark Registrations - Bird Migration Dataset (1 column x 3 algorithms) +// ============================================================================ + +#define GENERATE_ALGORITHM_FOR_BIRD_MIGRATION(ALGO, ENGINE) \ + BENCHMARK_BIRD_MIGRATION_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_BIRD_MIGRATION_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_BIRD_MIGRATION(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// ============================================================================ +// Benchmark Registrations - Common Government Dataset (3 columns x 3 algorithms) +// ============================================================================ + +#define GENERATE_ALGORITHM_FOR_COMMON_GOVERNMENT(COLUMN_CAP, COLUMN_LOWER, ALGO, \ + ENGINE) \ + BENCHMARK_COMMON_GOVERNMENT_COMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) \ + BENCHMARK_COMMON_GOVERNMENT_DECOMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) + +#define GENERATE_ALGORITHMS_FOR_COMMON_GOVERNMENT_COLUMN(COLUMN_CAP, COLUMN_LOWER) \ + GENERATE_ALGORITHM_FOR_COMMON_GOVERNMENT(COLUMN_CAP, COLUMN_LOWER, ALP, kALP) \ + GENERATE_ALGORITHM_FOR_COMMON_GOVERNMENT(COLUMN_CAP, COLUMN_LOWER, \ + BYTESTREAMSPLIT, kByteStreamSplit) \ + GENERATE_ALGORITHM_FOR_COMMON_GOVERNMENT(COLUMN_CAP, COLUMN_LOWER, ZSTD, kZSTD) + +#define X(COLUMN_CAP, COLUMN_LOWER) \ + GENERATE_ALGORITHMS_FOR_COMMON_GOVERNMENT_COLUMN(COLUMN_CAP, COLUMN_LOWER) +COMMON_GOVERNMENT_COLUMN_LIST +#undef X + +// ============================================================================ +// Benchmark Registrations - Arade Dataset (4 columns x 3 algorithms) +// ============================================================================ + +#define GENERATE_ALGORITHM_FOR_ARADE(COLUMN_CAP, COLUMN_LOWER, ALGO, ENGINE) \ + BENCHMARK_ARADE_COMPRESS(ALGO, COLUMN_CAP, 
COLUMN_LOWER, ENGINE) \ + BENCHMARK_ARADE_DECOMPRESS(ALGO, COLUMN_CAP, COLUMN_LOWER, ENGINE) + +#define GENERATE_ALGORITHMS_FOR_ARADE_COLUMN(COLUMN_CAP, COLUMN_LOWER) \ + GENERATE_ALGORITHM_FOR_ARADE(COLUMN_CAP, COLUMN_LOWER, ALP, kALP) \ + GENERATE_ALGORITHM_FOR_ARADE(COLUMN_CAP, COLUMN_LOWER, BYTESTREAMSPLIT, \ + kByteStreamSplit) \ + GENERATE_ALGORITHM_FOR_ARADE(COLUMN_CAP, COLUMN_LOWER, ZSTD, kZSTD) + +#define X(COLUMN_CAP, COLUMN_LOWER) \ + GENERATE_ALGORITHMS_FOR_ARADE_COLUMN(COLUMN_CAP, COLUMN_LOWER) +ARADE_COLUMN_LIST +#undef X + +// ============================================================================ +// Benchmark Registrations - FPC Datasets (13 single-column datasets x 3 each) +// ============================================================================ + +// NumBrain dataset +#define GENERATE_ALGORITHM_FOR_NUM_BRAIN(ALGO, ENGINE) \ + BENCHMARK_NUM_BRAIN_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_NUM_BRAIN_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_NUM_BRAIN(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// NumComet dataset +#define GENERATE_ALGORITHM_FOR_NUM_COMET(ALGO, ENGINE) \ + BENCHMARK_NUM_COMET_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_NUM_COMET_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_NUM_COMET(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// NumControl dataset +#define GENERATE_ALGORITHM_FOR_NUM_CONTROL(ALGO, ENGINE) \ + BENCHMARK_NUM_CONTROL_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_NUM_CONTROL_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_NUM_CONTROL(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// NumPlasma dataset +#define GENERATE_ALGORITHM_FOR_NUM_PLASMA(ALGO, ENGINE) \ + BENCHMARK_NUM_PLASMA_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_NUM_PLASMA_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_NUM_PLASMA(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// ObsError dataset +#define GENERATE_ALGORITHM_FOR_OBS_ERROR(ALGO, ENGINE) \ + 
BENCHMARK_OBS_ERROR_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_OBS_ERROR_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_OBS_ERROR(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// ObsInfo dataset +#define GENERATE_ALGORITHM_FOR_OBS_INFO(ALGO, ENGINE) \ + BENCHMARK_OBS_INFO_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_OBS_INFO_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_OBS_INFO(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// ObsSpitzer dataset +#define GENERATE_ALGORITHM_FOR_OBS_SPITZER(ALGO, ENGINE) \ + BENCHMARK_OBS_SPITZER_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_OBS_SPITZER_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_OBS_SPITZER(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// ObsTemp dataset +#define GENERATE_ALGORITHM_FOR_OBS_TEMP(ALGO, ENGINE) \ + BENCHMARK_OBS_TEMP_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_OBS_TEMP_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_OBS_TEMP(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// MsgBt dataset +#define GENERATE_ALGORITHM_FOR_MSG_BT(ALGO, ENGINE) \ + BENCHMARK_MSG_BT_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_MSG_BT_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_MSG_BT(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// MsgLu dataset +#define GENERATE_ALGORITHM_FOR_MSG_LU(ALGO, ENGINE) \ + BENCHMARK_MSG_LU_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_MSG_LU_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_MSG_LU(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// MsgSp dataset +#define GENERATE_ALGORITHM_FOR_MSG_SP(ALGO, ENGINE) \ + BENCHMARK_MSG_SP_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_MSG_SP_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_MSG_SP(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// MsgSppm dataset +#define GENERATE_ALGORITHM_FOR_MSG_SPPM(ALGO, ENGINE) \ + BENCHMARK_MSG_SPPM_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_MSG_SPPM_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) 
GENERATE_ALGORITHM_FOR_MSG_SPPM(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +// MsgSweep3d dataset +#define GENERATE_ALGORITHM_FOR_MSG_SWEEP3D(ALGO, ENGINE) \ + BENCHMARK_MSG_SWEEP3D_COMPRESS(ALGO, ENGINE) \ + BENCHMARK_MSG_SWEEP3D_DECOMPRESS(ALGO, ENGINE) + +#define X(ALGO, ENGINE) GENERATE_ALGORITHM_FOR_MSG_SWEEP3D(ALGO, ENGINE) +ALGORITHM_LIST +#undef X + +} // namespace parquet + +BENCHMARK_MAIN(); diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc index bea1a5807a2..48ee0558567 100644 --- a/cpp/src/parquet/encoding_benchmark.cc +++ b/cpp/src/parquet/encoding_benchmark.cc @@ -661,6 +661,78 @@ BENCHMARK(BM_ByteStreamSplitEncode_Float_Neon)->Apply(ByteStreamSplitApply); BENCHMARK(BM_ByteStreamSplitEncode_Double_Neon)->Apply(ByteStreamSplitApply); #endif +// ---------------------------------------------------------------------- +// ALP encoding/decoding benchmarks + +static void BM_AlpEncodingFloat(benchmark::State& state) { + std::vector values(state.range(0), 64.0f); + auto encoder = MakeTypedEncoder(Encoding::ALP); + for (auto _ : state) { + encoder->Put(values.data(), static_cast(values.size())); + encoder->FlushValues(); + } + state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(float)); + state.SetItemsProcessed(state.iterations() * state.range(0)); +} + +BENCHMARK(BM_AlpEncodingFloat)->Range(MIN_RANGE, MAX_RANGE); + +static void BM_AlpDecodingFloat(benchmark::State& state) { + std::vector values(state.range(0), 64.0f); + auto encoder = MakeTypedEncoder(Encoding::ALP); + encoder->Put(values.data(), static_cast(values.size())); + std::shared_ptr buf = encoder->FlushValues(); + + for (auto _ : state) { + auto decoder = MakeTypedDecoder(Encoding::ALP); + decoder->SetData(static_cast(values.size()), buf->data(), + static_cast(buf->size())); + std::vector output(values.size()); + decoder->Decode(output.data(), static_cast(values.size())); + benchmark::ClobberMemory(); + } + 
state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(float)); + state.SetItemsProcessed(state.iterations() * state.range(0)); +} + +BENCHMARK(BM_AlpDecodingFloat)->Range(MIN_RANGE, MAX_RANGE); + +static void BM_AlpEncodingDouble(benchmark::State& state) { + std::vector values(state.range(0), 64.0); + auto encoder = MakeTypedEncoder(Encoding::ALP); + for (auto _ : state) { + encoder->Put(values.data(), static_cast(values.size())); + encoder->FlushValues(); + } + state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(double)); + state.SetItemsProcessed(state.iterations() * state.range(0)); +} + +BENCHMARK(BM_AlpEncodingDouble)->Range(MIN_RANGE, MAX_RANGE); + +static void BM_AlpDecodingDouble(benchmark::State& state) { + std::vector values(state.range(0), 64.0); + auto encoder = MakeTypedEncoder(Encoding::ALP); + encoder->Put(values.data(), static_cast(values.size())); + std::shared_ptr buf = encoder->FlushValues(); + + for (auto _ : state) { + auto decoder = MakeTypedDecoder(Encoding::ALP); + decoder->SetData(static_cast(values.size()), buf->data(), + static_cast(buf->size())); + std::vector output(values.size()); + decoder->Decode(output.data(), static_cast(values.size())); + benchmark::ClobberMemory(); + } + state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(double)); + state.SetItemsProcessed(state.iterations() * state.range(0)); +} + +BENCHMARK(BM_AlpDecodingDouble)->Range(MIN_RANGE, MAX_RANGE); + +// ---------------------------------------------------------------------- +// DeltaBitPacking encoding/decoding benchmarks + template static auto MakeDeltaBitPackingInputFixed(size_t length) { using T = typename DType::c_type; diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc index fb4eb92a754..575d7e65726 100644 --- a/cpp/src/parquet/types.cc +++ b/cpp/src/parquet/types.cc @@ -259,6 +259,8 @@ std::string EncodingToString(Encoding::type t) { return "RLE_DICTIONARY"; case Encoding::BYTE_STREAM_SPLIT: return 
"BYTE_STREAM_SPLIT"; + case Encoding::ALP: + return "ALP"; default: return "UNKNOWN"; } diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 7e8a18fc94d..ef64aa7f323 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -538,8 +538,9 @@ struct Encoding { DELTA_BYTE_ARRAY = 7, RLE_DICTIONARY = 8, BYTE_STREAM_SPLIT = 9, + ALP = 10, // Should always be last element (except UNKNOWN) - UNDEFINED = 10, + UNDEFINED = 11, UNKNOWN = 999 }; }; diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index a3d96a65e11..66dfde8b2a5 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit a3d96a65e11e2bbca7d22a894e8313ede90a33a3 +Subproject commit 66dfde8b2a569e7cbc8e998153e8dd6f2b36f940