From 48191496bb0bf84aa3cfc9ce769527af3e4bb195 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Thu, 9 Oct 2025 10:46:26 +0200 Subject: [PATCH 1/8] better logging --- src/Databases/DataLake/GlueCatalog.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index de9b4f533f28..273e2100876a 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -428,7 +428,7 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet metadata_path = metadata_path.substr(pos + 1); } else - throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Metadata specific properties should be defined"); + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Failed to read table metadata, reason why table is unreadable: {}", table_metadata.getReasonWhyTableIsUnreadable()); if (!metadata_objects.get(metadata_path)) { From 184963d1d8b4e38dd334808915b5aae9fd0fd4ea Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Thu, 2 Oct 2025 14:10:17 +0200 Subject: [PATCH 2/8] Merge pull request #1053 from Altinity/backport/antalya-25.6.5/87733 Antalya 25.6.5 Backport of 87733: Fix handling of timestamp(tz) columns in Glue Catalog --- src/Databases/DataLake/GlueCatalog.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index 273e2100876a..97fb812e6569 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -415,10 +415,12 @@ bool GlueCatalog::empty() const bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMetadata & table_metadata) const { String metadata_path; + String metadata_uri; if (auto table_specific_properties = table_metadata.getDataLakeSpecificProperties(); table_specific_properties.has_value()) { metadata_path = table_specific_properties->iceberg_metadata_file_location; + metadata_uri = metadata_path; if (metadata_path.starts_with("s3:/")) metadata_path = metadata_path.substr(5); @@ -430,22 +432,24 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet else throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Failed to read table metadata, reason why table is unreadable: {}", table_metadata.getReasonWhyTableIsUnreadable()); - if (!metadata_objects.get(metadata_path)) + if (!metadata_objects.get(metadata_uri)) { DB::ASTStorage * storage = table_engine_definition->as(); DB::ASTs args = storage->engine->arguments->children; - auto table_endpoint = settings.storage_endpoint; + String storage_endpoint = !settings.storage_endpoint.empty() ? settings.storage_endpoint : metadata_uri; + if (args.empty()) - args.emplace_back(std::make_shared(table_endpoint)); + args.emplace_back(std::make_shared(storage_endpoint)); else - args[0] = std::make_shared(table_endpoint); + args[0] = std::make_shared(storage_endpoint); - if (args.size() == 1 && table_metadata.hasStorageCredentials()) + if (args.size() == 1) { - auto storage_credentials = table_metadata.getStorageCredentials(); - if (storage_credentials) - storage_credentials->addCredentialsToEngineArgs(args); + if (table_metadata.hasStorageCredentials()) + table_metadata.getStorageCredentials()->addCredentialsToEngineArgs(args); + else if (!credentials.IsExpiredOrEmpty()) + DataLake::S3Credentials(credentials.GetAWSAccessKeyId(), credentials.GetAWSSecretKey(), credentials.GetSessionToken()).addCredentialsToEngineArgs(args); } auto storage_settings = std::make_shared(); @@ -464,9 +468,9 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet Poco::JSON::Parser parser; Poco::Dynamic::Var result = parser.parse(metadata_file); auto metadata_object = result.extract(); - metadata_objects.set(metadata_path, std::make_shared(metadata_object)); + metadata_objects.set(metadata_uri, std::make_shared(metadata_object)); } - auto metadata_object = *metadata_objects.get(metadata_path); + auto metadata_object = *metadata_objects.get(metadata_uri); auto current_schema_id = metadata_object->getValue("current-schema-id"); auto schemas = metadata_object->getArray(DB::Iceberg::f_schemas); for (size_t i = 0; i < schemas->size(); ++i) From 7fc341ca43d6543307c8a70c287139fbabfb4d15 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Sun, 12 Oct 2025 23:23:15 +0200 Subject: [PATCH 3/8] deduce metadata location based on table location --- src/Databases/DataLake/GlueCatalog.cpp | 82 ++++++++++++++++++- src/Databases/DataLake/GlueCatalog.h | 2 + .../DataLakes/DataLakeStorageSettings.h | 3 + 3 files changed, 83 insertions(+), 4 deletions(-) diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index 97fb812e6569..dbeb9bd59e3b 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -317,11 +317,22 @@ bool GlueCatalog::tryGetTableMetadata( { result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = table_params.at("metadata_location")}); } + else if (const auto & location = table_outcome.GetStorageDescriptor().GetLocation(); !location.empty()) + { + std::string location_with_slash = location; + if (!location_with_slash.ends_with('/')) + location_with_slash += '/'; + + // Resolve the actual metadata file path based on table location + std::string resolved_metadata_path = resolveMetadataPathFromTableLocation(location_with_slash, result); + result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = resolved_metadata_path}); + } + else { - result.setTableIsNotReadable(fmt::format("Cannot read table `{}` because it has no metadata_location. " \ - "It means that it's unreadable with Glue catalog in ClickHouse, readable tables must have 'metadata_location' in table parameters", - database_name + "." + table_name)); + result.setTableIsNotReadable(fmt::format("Cannot read table `{}` because it has no metadata_location. " \ + "It means that it's unreadable with Glue catalog in ClickHouse, readable tables must have 'metadata_location' in table parameters", + database_name + "." + table_name)); } }; @@ -424,7 +435,7 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet if (metadata_path.starts_with("s3:/")) metadata_path = metadata_path.substr(5); - // Delete bucket + // Delete bucket from path std::size_t pos = metadata_path.find('/'); if (pos != std::string::npos) metadata_path = metadata_path.substr(pos + 1); @@ -491,6 +502,69 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet return false; } +String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_location, const TableMetadata & table_metadata) const +{ + // Construct path to version-hint.text + String version_hint_path = table_location + "metadata/version-hint.text"; + + DB::ASTStorage * storage = table_engine_definition->as(); + DB::ASTs args = storage->engine->arguments->children; + + String storage_endpoint = !settings.storage_endpoint.empty() ? settings.storage_endpoint : table_location; + if (args.empty()) + args.emplace_back(std::make_shared(storage_endpoint)); + else + args[0] = std::make_shared(storage_endpoint); + + if (args.size() == 1 && table_metadata.hasStorageCredentials()) + { + auto storage_credentials = table_metadata.getStorageCredentials(); + if (storage_credentials) + storage_credentials->addCredentialsToEngineArgs(args); + } + + auto storage_settings = std::make_shared(); + storage_settings->loadFromSettingsChanges(settings.allChanged()); + auto configuration = std::make_shared(storage_settings); + DB::StorageObjectStorageConfiguration::initialize(*configuration, args, getContext(), false); + + auto object_storage = configuration->createObjectStorage(getContext(), true); + const auto & read_settings = getContext()->getReadSettings(); + + try + { + // Try to read version-hint.text to get the latest version + String version_hint_object_path = version_hint_path; + if (version_hint_object_path.starts_with("s3://")) + { + version_hint_object_path = version_hint_object_path.substr(5); + // Remove bucket from path + std::size_t pos = version_hint_object_path.find('/'); + if (pos != std::string::npos) + version_hint_object_path = version_hint_object_path.substr(pos + 1); + } + + DB::StoredObject version_hint_stored_object(version_hint_object_path); + auto version_hint_buf = object_storage->readObject(version_hint_stored_object, read_settings); + String version_str; + readString(version_str, *version_hint_buf); + + // Trim whitespace + boost::algorithm::trim(version_str); + + LOG_TRACE(log, "Read version {} from version-hint.text for table location '{}'", version_str, table_location); + + // Construct metadata file path: table_location/metadata/v{version}-metadata.json + return table_location + "metadata/v" + version_str + "-metadata.json"; + } + catch (...) + { + // If version-hint.text doesn't exist or is unreadable, fall back to metadata.json + LOG_TRACE(log, "Could not read version-hint.text from '{}', falling back to metadata.json", version_hint_path); + return table_location + "metadata/metadata.json"; + } +} + void GlueCatalog::createNamespaceIfNotExists(const String & namespace_name) const { Aws::Glue::Model::CreateDatabaseRequest create_request; diff --git a/src/Databases/DataLake/GlueCatalog.h b/src/Databases/DataLake/GlueCatalog.h index bed6e93c5dcc..3bf1769ac83e 100644 --- a/src/Databases/DataLake/GlueCatalog.h +++ b/src/Databases/DataLake/GlueCatalog.h @@ -81,6 +81,8 @@ class GlueCatalog final : public ICatalog, private DB::WithContext /// This method allows to clarify the actual type of the timestamp column. bool classifyTimestampTZ(const String & column_name, const TableMetadata & table_metadata) const; + String resolveMetadataPathFromTableLocation(const String & table_location, const TableMetadata & table_metadata) const; + mutable DB::CacheBase metadata_objects; }; diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h b/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h index 5c9ef47f8f01..eb6e2e117f8e 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h @@ -51,6 +51,9 @@ If enabled, indicates that metadata is taken from iceberg specification that is )", 0) \ DECLARE(String, iceberg_metadata_file_path, "", R"( Explicit path to desired Iceberg metadata file, should be relative to path in object storage. Make sense for table function use case only. +)", 0) \ + DECLARE(String, iceberg_table_location, "", R"( +Explicit path to Iceberg table location (warehouse). If no iceberg_metadata_file_path provided, it will be deduced using this parameter. )", 0) \ DECLARE(String, iceberg_metadata_table_uuid, "", R"( Explicit table UUID to read metadata for. Ignored if iceberg_metadata_file_path is set. From 6d15034f6935cf7e5167ad072c1b830d4de95572 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Mon, 13 Oct 2025 00:10:31 +0200 Subject: [PATCH 4/8] fix deducing metadata --- src/Databases/DataLake/GlueCatalog.cpp | 73 ++++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 5 deletions(-) diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index dbeb9bd59e3b..0f7d3097bd99 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -325,9 +325,16 @@ bool GlueCatalog::tryGetTableMetadata( // Resolve the actual metadata file path based on table location std::string resolved_metadata_path = resolveMetadataPathFromTableLocation(location_with_slash, result); - result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = resolved_metadata_path}); + if (resolved_metadata_path.empty()) + { + result.setTableIsNotReadable(fmt::format("Could not determine metadata_location of table `{}`. ", + database_name + "." + table_name)); + } + else + { + result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = resolved_metadata_path}); + } } - else { result.setTableIsNotReadable(fmt::format("Cannot read table `{}` because it has no metadata_location. " \ @@ -559,9 +566,65 @@ String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_lo } catch (...) { - // If version-hint.text doesn't exist or is unreadable, fall back to metadata.json - LOG_TRACE(log, "Could not read version-hint.text from '{}', falling back to metadata.json", version_hint_path); - return table_location + "metadata/metadata.json"; + // If version-hint.text doesn't exist or is unreadable, list all metadata files and select the latest + LOG_TRACE(log, "Could not read version-hint.text from '{}', trying to find latest metadata file", version_hint_path); + + try + { + String bucket_with_prefix; + String metadata_dir = table_location + "metadata/"; + String metadata_dir_path = metadata_dir; + + if (metadata_dir_path.starts_with("s3://")) + { + metadata_dir_path = metadata_dir_path.substr(5); + // Remove bucket from path + std::size_t pos = metadata_dir_path.find('/'); + if (pos != std::string::npos) + { + metadata_dir_path = metadata_dir_path.substr(pos + 1); + bucket_with_prefix = table_location.substr(0, pos + 6); + } + } + else + return ""; + + // List all files in metadata directory + DB::RelativePathsWithMetadata files; + object_storage->listObjects(metadata_dir_path, files, 0); + + // Filter for .metadata.json files and find the most recent one + String latest_metadata_file; + std::optional latest_metadata; + + for (const auto & file : files) + { + if (file->getPath().ends_with(".metadata.json")) + { + // Get file metadata to check last modified time + if (!latest_metadata.has_value() || + (file->metadata->last_modified > latest_metadata->last_modified)) + { + latest_metadata_file = file->getPath(); + latest_metadata = file->metadata; + } + } + } + + if (!latest_metadata_file.empty()) + { + LOG_TRACE(log, "Found latest metadata file: {}", latest_metadata_file); + return bucket_with_prefix + latest_metadata_file; + } + + LOG_TRACE(log, "No .metadata.json files found,"); + return ""; + } + catch (...) + { + LOG_TRACE(log, "Failed to list metadata directory"); + return ""; + } } } From c62b5646c87c3b1b211f4d6a04d28f1393c7b89c Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Mon, 13 Oct 2025 13:28:37 +0200 Subject: [PATCH 5/8] fix build --- src/Databases/DataLake/GlueCatalog.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index 0f7d3097bd99..13a90d648ebe 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -533,7 +533,7 @@ String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_lo auto storage_settings = std::make_shared(); storage_settings->loadFromSettingsChanges(settings.allChanged()); auto configuration = std::make_shared(storage_settings); - DB::StorageObjectStorageConfiguration::initialize(*configuration, args, getContext(), false); + configuration->initialize(args, getContext(), false); auto object_storage = configuration->createObjectStorage(getContext(), true); const auto & read_settings = getContext()->getReadSettings(); @@ -556,17 +556,14 @@ String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_lo String version_str; readString(version_str, *version_hint_buf); - // Trim whitespace boost::algorithm::trim(version_str); LOG_TRACE(log, "Read version {} from version-hint.text for table location '{}'", version_str, table_location); - // Construct metadata file path: table_location/metadata/v{version}-metadata.json return table_location + "metadata/v" + version_str + "-metadata.json"; } catch (...) { - // If version-hint.text doesn't exist or is unreadable, list all metadata files and select the latest LOG_TRACE(log, "Could not read version-hint.text from '{}', trying to find latest metadata file", version_hint_path); try @@ -617,7 +614,7 @@ String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_lo return bucket_with_prefix + latest_metadata_file; } - LOG_TRACE(log, "No .metadata.json files found,"); + LOG_TRACE(log, "No <...>.metadata.json files found,"); return ""; } catch (...) From 3feb31aca355506b1e071e87e953c328e84371ae Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Tue, 14 Oct 2025 11:42:26 +0200 Subject: [PATCH 6/8] remove unnecessary change --- src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h b/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h index eb6e2e117f8e..5c9ef47f8f01 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeStorageSettings.h @@ -51,9 +51,6 @@ If enabled, indicates that metadata is taken from iceberg specification that is )", 0) \ DECLARE(String, iceberg_metadata_file_path, "", R"( Explicit path to desired Iceberg metadata file, should be relative to path in object storage. Make sense for table function use case only. -)", 0) \ - DECLARE(String, iceberg_table_location, "", R"( -Explicit path to Iceberg table location (warehouse). If no iceberg_metadata_file_path provided, it will be deduced using this parameter. )", 0) \ DECLARE(String, iceberg_metadata_table_uuid, "", R"( Explicit table UUID to read metadata for. Ignored if iceberg_metadata_file_path is set. From 75e15997a36050d4df6d6bb911fa203de617a0f4 Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Tue, 14 Oct 2025 16:50:45 +0200 Subject: [PATCH 7/8] poke CI From 0402d8d10d663057848e5d0d2c56ff438d558eef Mon Sep 17 00:00:00 2001 From: Andrey Zvonov Date: Wed, 15 Oct 2025 16:41:14 +0200 Subject: [PATCH 8/8] polish after review --- src/Databases/DataLake/GlueCatalog.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Databases/DataLake/GlueCatalog.cpp b/src/Databases/DataLake/GlueCatalog.cpp index 13a90d648ebe..e76e81c6c3c3 100644 --- a/src/Databases/DataLake/GlueCatalog.cpp +++ b/src/Databases/DataLake/GlueCatalog.cpp @@ -317,8 +317,10 @@ bool GlueCatalog::tryGetTableMetadata( { result.setDataLakeSpecificProperties(DataLakeSpecificProperties{.iceberg_metadata_file_location = table_params.at("metadata_location")}); } - else if (const auto & location = table_outcome.GetStorageDescriptor().GetLocation(); !location.empty()) + else if (table_outcome.GetStorageDescriptor().LocationHasBeenSet()) { + const auto & location = table_outcome.GetStorageDescriptor().GetLocation(); + std::string location_with_slash = location; if (!location_with_slash.ends_with('/')) location_with_slash += '/'; @@ -509,6 +511,9 @@ bool GlueCatalog::classifyTimestampTZ(const String & column_name, const TableMet return false; } +/// This function tries two resolve the metadata file path by following means: +/// 1. Tries to read version-hint.text to get the latest version. +/// 2. Lists all *.metadata.json files in the metadata directory and takes the most recent one. String GlueCatalog::resolveMetadataPathFromTableLocation(const String & table_location, const TableMetadata & table_metadata) const { // Construct path to version-hint.text