From 0ebcdb73e4786ad120fdea1ab0ed7d7250006ca0 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Fri, 20 Jun 2025 09:42:00 +0000 Subject: [PATCH] Merge pull request #82114 from ClickHouse/fix_parser_for_complex_types Fix support for nested data types with decimal subfields in glue catalog --- src/Databases/DataLake/GlueCatalog.cpp | 22 +++-- tests/integration/test_database_glue/test.py | 88 +++++++++++++++++--- 2 files changed, 91 insertions(+), 19 deletions(-) diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index 01fe88d0db8d..a8a5c2e58699 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -64,20 +64,28 @@ String trim(const String & str) std::vector splitTypeArguments(const String & type_str) { std::vector args; - int depth = 0; + int angle_depth = 0; + int paren_depth = 0; size_t start = 0; - for (size_t i = 0; i < type_str.size(); i++) + + for (size_t i = 0; i < type_str.size(); ++i) { - if (type_str[i] == '<') - depth++; - else if (type_str[i] == '>') - depth--; - else if (type_str[i] == ',' && depth == 0) + char c = type_str[i]; + if (c == '<') + angle_depth++; + else if (c == '>') + angle_depth--; + else if (c == '(') + paren_depth++; + else if (c == ')') + paren_depth--; + else if (c == ',' && angle_depth == 0 && paren_depth == 0) { args.push_back(trim(type_str.substr(start, i - start))); start = i + 1; } } + args.push_back(trim(type_str.substr(start))); return args; } diff --git a/tests/integration/test_database_glue/test.py b/tests/integration/test_database_glue/test.py index 7809b67c9c54..32c16db7a6ea 100644 --- a/tests/integration/test_database_glue/test.py +++ b/tests/integration/test_database_glue/test.py @@ -12,19 +12,23 @@ import pytest import requests import urllib3 +from datetime import datetime, timedelta from minio import Minio from pyiceberg.catalog import load_catalog from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.table.sorting import SortField, SortOrder from pyiceberg.transforms import DayTransform, IdentityTransform +from helpers.config_cluster import minio_access_key, minio_secret_key +import decimal from pyiceberg.types import ( DoubleType, - FloatType, NestedField, StringType, StructType, TimestampType, + MapType, + DecimalType, ) from helpers.cluster import ClickHouseCluster, ClickHouseInstance, is_arm @@ -39,6 +43,11 @@ BASE_URL = "http://glue:3000" BASE_URL_LOCAL_HOST = "http://localhost:3000" +def generate_decimal(precision=9, scale=2): + max_value = 10**(precision - scale) - 1 + value = random.uniform(0, max_value) + return round(decimal.Decimal(value), scale) + DEFAULT_SCHEMA = Schema( NestedField( field_id=1, name="datetime", field_type=TimestampType(), required=False @@ -59,9 +68,21 @@ ), required=False, ), + NestedField( + field_id=6, + name="map_string_decimal", + field_type=MapType( + key_type=StringType(), + value_type=DecimalType(9, 2), + key_id=7, + value_id=8, + value_required=False, + ), + required=False, + ), ) -DEFAULT_CREATE_TABLE = "CREATE TABLE {}.`{}.{}`\\n(\\n `datetime` Nullable(DateTime64(6)),\\n `symbol` Nullable(String),\\n `bid` Nullable(Float64),\\n `ask` Nullable(Float64),\\n `details` Tuple(created_by Nullable(String))\\n)\\nENGINE = Iceberg(\\'http://minio:9000/warehouse/data/\\', \\'minio\\', \\'[HIDDEN]\\')\n" +DEFAULT_CREATE_TABLE = "CREATE TABLE {}.`{}.{}`\\n(\\n `datetime` Nullable(DateTime64(6)),\\n `symbol` Nullable(String),\\n `bid` Nullable(Float64),\\n `ask` Nullable(Float64),\\n `details` Tuple(created_by Nullable(String)),\\n `map_string_decimal` Map(String, Nullable(Decimal(9, 2)))\\n)\\nENGINE = Iceberg(\\'http://minio:9000/warehouse-glue/data/\\', \\'minio\\', \\'[HIDDEN]\\')\n" DEFAULT_PARTITION_SPEC = PartitionSpec( PartitionField( @@ -107,15 +128,59 @@ def create_table( ) -def generate_record(): - return { - "datetime": datetime.now(), - "symbol": str("kek"), - "bid": round(random.uniform(100, 200), 2), - "ask": round(random.uniform(200, 300), 2), - "details": {"created_by": "Alice Smith"}, - } +def generate_arrow_data(num_rows=5): + datetimes = [] + symbols = [] + bids = [] + asks = [] + details_created_by = [] + map_keys = [] + map_values = [] + + offsets = [0] + + for _ in range(num_rows): + datetimes.append(datetime.utcnow() - timedelta(minutes=random.randint(0, 60))) + symbols.append(random.choice(["AAPL", "GOOG", "MSFT"])) + bids.append(random.uniform(100, 150)) + asks.append(random.uniform(150, 200)) + details_created_by.append(random.choice(["alice", "bob", "carol"])) + + # map + keys = [] + values = [] + for i in range(random.randint(1, 3)): + keys.append(f"key{i}") + values.append(generate_decimal()) + map_keys.extend(keys) + map_values.extend(values) + offsets.append(offsets[-1] + len(keys)) + + # Struct for 'details' + struct_array = pa.StructArray.from_arrays( + [pa.array(details_created_by, type=pa.string())], + names=["created_by"] + ) + + # Map array + map_array = pa.MapArray.from_arrays( + offsets=pa.array(offsets, type=pa.int32()), + keys=pa.array(map_keys, type=pa.string()), + items=pa.array(map_values, type=pa.decimal128(9, 2)) + ) + + # Final table + table = pa.table({ + "datetime": pa.array(datetimes, type=pa.timestamp("us")), + "symbol": pa.array(symbols, type=pa.string()), + "bid": pa.array(bids, type=pa.float64()), + "ask": pa.array(asks, type=pa.float64()), + "details": struct_array, + "map_string_decimal": map_array, + }) + + return table def create_clickhouse_glue_database( started_cluster, node, name, additional_settings={} @@ -259,8 +324,7 @@ def test_select(started_cluster): table = create_table(catalog, namespace, table_name) num_rows = 10 - data = [generate_record() for _ in range(num_rows)] - df = pa.Table.from_pylist(data) + df = generate_arrow_data(num_rows) table.append(df) create_clickhouse_glue_database(started_cluster, node, CATALOG_NAME)