diff --git a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h
index d49375b03071..2ac14046f5f7 100644
--- a/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h
+++ b/src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h
@@ -169,7 +169,7 @@ struct KeyValuePairExtractorReferenceMap : extractKV::KeyValuePairExtractor & map)
+    uint64_t extract(std::string_view data, std::map & map)
     {
         auto pair_writer = typename StateHandler::PairWriter(map);
         return extractImpl(data, pair_writer);
diff --git a/src/Functions/keyvaluepair/impl/StateHandlerImpl.h b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h
index a7169f21a8fa..60cc5192ff14 100644
--- a/src/Functions/keyvaluepair/impl/StateHandlerImpl.h
+++ b/src/Functions/keyvaluepair/impl/StateHandlerImpl.h
@@ -13,7 +13,6 @@
 #include
 #include
 #include
-#include

 namespace DB
@@ -577,13 +576,13 @@ struct ReferencesMapStateHandler : public StateHandlerImpl
      * */
    class PairWriter
    {
-        absl::flat_hash_map & map;
+        std::map & map;

        std::string_view key;
        std::string_view value;

    public:
-        explicit PairWriter(absl::flat_hash_map & map_)
+        explicit PairWriter(std::map & map_)
            : map(map_) {}
diff --git a/src/Storages/HivePartitioningUtils.cpp b/src/Storages/HivePartitioningUtils.cpp
index a7496cb98cc0..2ed5b002ba98 100644
--- a/src/Storages/HivePartitioningUtils.cpp
+++ b/src/Storages/HivePartitioningUtils.cpp
@@ -3,6 +3,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -20,6 +21,7 @@ namespace Setting
 namespace ErrorCodes
 {
     extern const int INCORRECT_DATA;
+    extern const int BAD_ARGUMENTS;
 }

 namespace HivePartitioningUtils
@@ -83,7 +85,14 @@ NamesAndTypesList extractHivePartitionColumnsFromPath(
     {
         if (const auto type = tryInferDataTypeByEscapingRule(value, format_settings ? *format_settings : getFormatSettings(context), FormatSettings::EscapingRule::Raw))
         {
-            hive_partition_columns_to_read_from_file_path.emplace_back(key, type);
+            if (type->canBeInsideLowCardinality())
+            {
+                hive_partition_columns_to_read_from_file_path.emplace_back(key, std::make_shared(type));
+            }
+            else
+            {
+                hive_partition_columns_to_read_from_file_path.emplace_back(key, type);
+            }
         }
         else
         {
@@ -122,6 +131,29 @@ void addPartitionColumnsToChunk(
     }
 }

+void sanityCheckSchemaAndHivePartitionColumns(const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, const ColumnsDescription & storage_columns)
+{
+    for (const auto & column : hive_partition_columns_to_read_from_file_path)
+    {
+        if (!storage_columns.has(column.name))
+        {
+            throw Exception(
+                ErrorCodes::BAD_ARGUMENTS,
+                "All hive partitioning columns must be present in the schema. Missing column: {}. "
+                "If you do not want to use hive partitioning, try `use_hive_partitioning=0` and/or `partition_strategy != hive`",
+                column.name);
+        }
+    }
+
+    if (storage_columns.size() == hive_partition_columns_to_read_from_file_path.size())
+    {
+        throw Exception(
+            ErrorCodes::INCORRECT_DATA,
+            "A hive partitioned file can't contain only partition columns. "
+            "Try reading it with `use_hive_partitioning=0` and/or `partition_strategy != hive`");
+    }
+}
+
 void extractPartitionColumnsFromPathAndEnrichStorageColumns(
     ColumnsDescription & storage_columns,
     NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
     const std::string & path,
@@ -144,13 +176,96 @@ void extractPartitionColumnsFromPathAndEnrichStorageColumns(
             }
         }
     }
+}

-    if (hive_partition_columns_to_read_from_file_path.size() == storage_columns.size())
+HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForObjectStorage(
+    ColumnsDescription & columns,
+    const StorageObjectStorage::ConfigurationPtr & configuration,
+    const std::string & sample_path,
+    bool inferred_schema,
+    std::optional format_settings,
+    ContextPtr context)
+{
+    NamesAndTypesList hive_partition_columns_to_read_from_file_path;
+    NamesAndTypesList file_columns;
+
+    /*
+     * If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
+     * There is no need to read from the file's path.
+     *
+     * Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
+     * And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
+     */
+    if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
+    {
+        hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns();
+    }
+    else if (context->getSettingsRef()[Setting::use_hive_partitioning])
+    {
+        extractPartitionColumnsFromPathAndEnrichStorageColumns(
+            columns,
+            hive_partition_columns_to_read_from_file_path,
+            sample_path,
+            inferred_schema,
+            format_settings,
+            context);
+    }
+
+    sanityCheckSchemaAndHivePartitionColumns(hive_partition_columns_to_read_from_file_path, columns);
+
+    if (configuration->partition_columns_in_data_file)
+    {
+        file_columns = columns.getAllPhysical();
+    }
+    else
+    {
+        std::unordered_set hive_partition_columns_to_read_from_file_path_set;
+
+        for (const auto & [name, type] : hive_partition_columns_to_read_from_file_path)
+        {
+            hive_partition_columns_to_read_from_file_path_set.insert(name);
+        }
+
+        for (const auto & [name, type] : columns.getAllPhysical())
+        {
+            if (!hive_partition_columns_to_read_from_file_path_set.contains(name))
+            {
+                file_columns.emplace_back(name, type);
+            }
+        }
+    }
+
+    return {hive_partition_columns_to_read_from_file_path, file_columns};
+}
+
+HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForFileURLLikeStorage(
+    ColumnsDescription & columns,
+    const std::string & sample_path,
+    bool inferred_schema,
+    std::optional format_settings,
+    ContextPtr context)
+{
+    NamesAndTypesList hive_partition_columns_to_read_from_file_path;
+    NamesAndTypesList file_columns;
+
+    if (context->getSettingsRef()[Setting::use_hive_partitioning])
     {
-        throw Exception(
-            ErrorCodes::INCORRECT_DATA,
-            "A hive partitioned file can't contain only partition columns. Try reading it with `use_hive_partitioning=0`");
+        extractPartitionColumnsFromPathAndEnrichStorageColumns(
+            columns,
+            hive_partition_columns_to_read_from_file_path,
+            sample_path,
+            inferred_schema,
+            format_settings,
+            context);
     }
+
+    sanityCheckSchemaAndHivePartitionColumns(hive_partition_columns_to_read_from_file_path, columns);
+
+    /// Partition strategy is not implemented for File/URL storages,
+    /// so there is no option to set whether hive partition columns are in the data file or not.
+    file_columns = columns.getAllPhysical();
+
+    return {hive_partition_columns_to_read_from_file_path, file_columns};
 }

 }
diff --git a/src/Storages/HivePartitioningUtils.h b/src/Storages/HivePartitioningUtils.h
index b532aee67a13..0550b5694750 100644
--- a/src/Storages/HivePartitioningUtils.h
+++ b/src/Storages/HivePartitioningUtils.h
@@ -1,7 +1,8 @@
 #pragma once

-#include
 #include
+#include
+#include

 namespace DB
 {
@@ -10,7 +11,7 @@ class Chunk;
 namespace HivePartitioningUtils
 {

-using HivePartitioningKeysAndValues = absl::flat_hash_map;
+using HivePartitioningKeysAndValues = std::map;

 HivePartitioningKeysAndValues parseHivePartitioningKeysAndValues(const std::string & path);

@@ -19,10 +20,20 @@ void addPartitionColumnsToChunk(
     const NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
     const std::string & path);

-void extractPartitionColumnsFromPathAndEnrichStorageColumns(
-    ColumnsDescription & storage_columns,
-    NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
-    const std::string & path,
+/// Hive partition columns and file columns (Note that file columns might not contain the hive partition columns)
+using HivePartitionColumnsWithFileColumnsPair = std::pair;
+
+HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForObjectStorage(
+    ColumnsDescription & columns,
+    const StorageObjectStorage::ConfigurationPtr & configuration,
+    const std::string & sample_path,
+    bool inferred_schema,
+    std::optional format_settings,
+    ContextPtr context);
+
+HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForFileURLLikeStorage(
+    ColumnsDescription & columns,
+    const std::string & sample_path,
     bool inferred_schema,
     std::optional format_settings,
     ContextPtr context);
diff --git a/src/Storages/IPartitionStrategy.cpp b/src/Storages/IPartitionStrategy.cpp
index 5ebdb249e661..6acf9c9a3a73 100644
--- a/src/Storages/IPartitionStrategy.cpp
+++ b/src/Storages/IPartitionStrategy.cpp
@@ -118,7 +118,7 @@ namespace
     if (file_format.empty() || file_format == "auto")
     {
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "File format can't be empty for hive style partitioning");
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "File format can't be empty for hive style partitioning");
     }

     const auto partition_key_description = KeyDescription::getKeyFromAST(partition_by, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context);
diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
index f910691767aa..738883b526eb 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp
@@ -172,56 +172,13 @@ StorageObjectStorage::StorageObjectStorage(
         }
     }

-    /*
-     * If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
-     * There is no need to read from the file's path.
-     *
-     * Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
-     * And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
-     */
-    if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
-    {
-        hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns();
-    }
-    else if (context->getSettingsRef()[Setting::use_hive_partitioning])
-    {
-        HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
-            columns,
-            hive_partition_columns_to_read_from_file_path,
-            sample_path,
-            columns_in_table_or_function_definition.empty(),
-            format_settings,
-            context);
-    }
-
-    if (hive_partition_columns_to_read_from_file_path.size() == columns.size())
-    {
-        throw Exception(
-            ErrorCodes::INCORRECT_DATA,
-            "A hive partitioned file can't contain only partition columns. Try reading it with `partition_strategy=wildcard` and `use_hive_partitioning=0`");
-    }
-
-    if (configuration->partition_columns_in_data_file)
-    {
-        file_columns = columns;
-    }
-    else
-    {
-        std::unordered_set hive_partition_columns_to_read_from_file_path_set;
-
-        for (const auto & [name, type] : hive_partition_columns_to_read_from_file_path)
-        {
-            hive_partition_columns_to_read_from_file_path_set.insert(name);
-        }
-
-        for (const auto & [name, type] : columns.getAllPhysical())
-        {
-            if (!hive_partition_columns_to_read_from_file_path_set.contains(name))
-            {
-                file_columns.add({name, type});
-            }
-        }
-    }
+    std::tie(hive_partition_columns_to_read_from_file_path, file_columns) = HivePartitioningUtils::setupHivePartitioningForObjectStorage(
+        columns,
+        configuration,
+        sample_path,
+        columns_in_table_or_function_definition.empty(),
+        format_settings,
+        context);

     // Assert file contains at least one column. The assertion only takes place if we were able to deduce the schema. The storage might be empty.
     if (!columns.empty() && file_columns.empty())
@@ -500,15 +457,13 @@ void StorageObjectStorage::read(
             getName());
     }

-    auto all_file_columns = file_columns.getAll();
-
     auto read_from_format_info = configuration->prepareReadingFromFormat(
         object_storage,
         column_names,
         storage_snapshot,
         supportsSubsetOfColumns(local_context),
         local_context,
-        PrepareReadingFromFormatHiveParams { all_file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap() });
+        PrepareReadingFromFormatHiveParams { file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap() });

     const bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty())
         && local_context->getSettingsRef()[Setting::optimize_count_from_files];
diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h
index 0c75431cd1c1..a4407a426647 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorage.h
+++ b/src/Storages/ObjectStorage/StorageObjectStorage.h
@@ -177,7 +177,7 @@ class StorageObjectStorage : public IStorage
     bool update_configuration_on_read_write = true;

     NamesAndTypesList hive_partition_columns_to_read_from_file_path;
-    ColumnsDescription file_columns;
+    NamesAndTypesList file_columns;

     LoggerPtr log;
 };
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp
index 424b0d5bfb52..6e3af8de409b 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp
@@ -29,7 +29,6 @@ namespace Setting
 namespace ErrorCodes
 {
     extern const int LOGICAL_ERROR;
-    extern const int INCORRECT_DATA;
 }

 String StorageObjectStorageCluster::getPathSample(ContextPtr context)
@@ -88,37 +87,14 @@ StorageObjectStorageCluster::StorageObjectStorageCluster(
     if (sample_path.empty() && context_->getSettingsRef()[Setting::use_hive_partitioning] && !configuration->isDataLakeConfiguration() && !configuration->partition_strategy)
         sample_path = getPathSample(context_);

-    /*
-     * If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
-     * There is no need to read from the filepath.
-     *
-     * Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
-     * And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
-     */
-    if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
-    {
-        hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns();
-    }
-    else if (context_->getSettingsRef()[Setting::use_hive_partitioning])
-    {
-        HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
-            columns,
-            hive_partition_columns_to_read_from_file_path,
-            sample_path,
-            columns_in_table_or_function_definition.empty(),
-            std::nullopt,
-            context_
-        );
-    }
-
-    if (hive_partition_columns_to_read_from_file_path.size() == columns.size())
-    {
-        throw Exception(
-            ErrorCodes::INCORRECT_DATA,
-            "A hive partitioned file can't contain only partition columns. Try reading it with `partition_strategy=wildcard` and `use_hive_partitioning=0`");
-    }
-
-    /// Hive: Not building the file_columns like `StorageObjectStorage` does because it is not necessary to do it here.
+    /// Not grabbing the file_columns because it is not necessary to do it here.
+    std::tie(hive_partition_columns_to_read_from_file_path, std::ignore) = HivePartitioningUtils::setupHivePartitioningForObjectStorage(
+        columns,
+        configuration,
+        sample_path,
+        columns_in_table_or_function_definition.empty(),
+        std::nullopt,
+        context_);

     StorageInMemoryMetadata metadata;
     metadata.setColumns(columns);
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
index b4ae724abd03..c55b62b350fb 100644
--- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
+++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
@@ -275,6 +275,16 @@ Chunk StorageObjectStorageSource::generate()

             const auto path = getUniqueStoragePathIdentifier(*configuration, *object_info, false);

+            /// The order is important, hive partition columns must be added before virtual columns
+            /// because they are part of the schema
+            if (!read_from_format_info.hive_partition_columns_to_read_from_file_path.empty())
+            {
+                HivePartitioningUtils::addPartitionColumnsToChunk(
+                    chunk,
+                    read_from_format_info.hive_partition_columns_to_read_from_file_path,
+                    path);
+            }
+
             VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk(
                 chunk,
                 read_from_format_info.requested_virtual_columns,
@@ -285,15 +295,6 @@ Chunk StorageObjectStorageSource::generate()
                 .etag = &(object_info->metadata->etag)},
                 read_context);

-            // The order is important, it must be added after virtual columns..
-            if (!read_from_format_info.hive_partition_columns_to_read_from_file_path.empty())
-            {
-                HivePartitioningUtils::addPartitionColumnsToChunk(
-                    chunk,
-                    read_from_format_info.hive_partition_columns_to_read_from_file_path,
-                    path);
-            }
-
             if (chunk_size && chunk.hasColumns())
             {
                 const auto * object_with_partition_columns_info = dynamic_cast(object_info.get());
diff --git a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp
index ac873743a81a..ce595a1eb1cd 100644
--- a/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp
+++ b/src/Storages/ObjectStorage/registerStorageObjectStorage.cpp
@@ -59,12 +59,16 @@ createStorageObjectStorage(const StorageFactory::Arguments & args, StorageObject
     if (args.storage_def->partition_by)
         partition_by = args.storage_def->partition_by->clone();

+    ContextMutablePtr context_copy = Context::createCopy(args.getContext());
+    Settings settings_copy = args.getLocalContext()->getSettingsCopy();
+    context_copy->setSettings(settings_copy);
+
     return std::make_shared(
         configuration,
         // We only want to perform write actions (e.g. create a container in Azure) when the table is being created,
         // and we want to avoid it when we load the table after a server restart.
         configuration->createObjectStorage(context, /* is_readonly */ args.mode != LoadingStrictnessLevel::CREATE),
-        args.getContext(), /// Use global context.
+        context_copy,
         args.table_id,
         args.columns,
         args.constraints,
diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp
index 5a3637ec33c2..48c11e8d8e9e 100644
--- a/src/Storages/StorageFile.cpp
+++ b/src/Storages/StorageFile.cpp
@@ -1148,19 +1148,12 @@ void StorageFile::setStorageMetadata(CommonArguments args)

     auto & storage_columns = storage_metadata.columns;

-    if (args.getContext()->getSettingsRef()[Setting::use_hive_partitioning])
-    {
-        HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
-            storage_columns,
-            hive_partition_columns_to_read_from_file_path,
-            sample_path,
-            args.columns.empty(),
-            format_settings,
-            args.getContext());
-    }
-
-    /// If the `partition_strategy` argument is ever implemented for File storage, this must be updated
-    file_columns = storage_columns.getAllPhysical();
+    std::tie(hive_partition_columns_to_read_from_file_path, std::ignore) = HivePartitioningUtils::setupHivePartitioningForFileURLLikeStorage(
+        storage_columns,
+        sample_path,
+        args.columns.empty(),
+        format_settings,
+        args.getContext());

     setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns));
     setInMemoryMetadata(storage_metadata);
@@ -1511,6 +1504,16 @@ Chunk StorageFileSource::generate()
                 chunk_size = input_format->getApproxBytesReadForChunk();
             progress(num_rows, chunk_size ? chunk_size : chunk.bytes());

+            /// The order is important, hive partition columns must be added before virtual columns
+            /// because they are part of the schema
+            if (!hive_partition_columns_to_read_from_file_path.empty())
+            {
+                HivePartitioningUtils::addPartitionColumnsToChunk(
+                    chunk,
+                    hive_partition_columns_to_read_from_file_path,
+                    current_path);
+            }
+
             /// Enrich with virtual columns.
             VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk(
                 chunk, requested_virtual_columns,
@@ -1521,15 +1524,6 @@
                 {
                     .path = current_path,
                     .size = current_file_size,
                    .filename = &filename,
                    .last_modified = current_file_last_modified
                }, getContext());

-            // The order is important, it must be added after virtual columns..
-            if (!hive_partition_columns_to_read_from_file_path.empty())
-            {
-                HivePartitioningUtils::addPartitionColumnsToChunk(
-                    chunk,
-                    hive_partition_columns_to_read_from_file_path,
-                    current_path);
-            }
-
             return chunk;
         }
@@ -2192,9 +2186,10 @@ void registerStorageFile(StorageFactory & factory)
         "File",
         [](const StorageFactory::Arguments & factory_args)
         {
+            auto context = factory_args.getLocalContext();
             StorageFile::CommonArguments storage_args
             {
-                WithContext(factory_args.getContext()),
+                WithContext(context),
                 factory_args.table_id,
                 {},
                 {},
@@ -2281,7 +2276,7 @@
                 return std::make_shared(source_fd, storage_args);

             /// User's file
-            return std::make_shared(source_path, factory_args.getContext()->getUserFilesPath(), false, storage_args);
+            return std::make_shared(source_path, context->getUserFilesPath(), false, storage_args);
         },
         storage_features);
 }
diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp
index 1946fdc8c77b..cd661c10d1da 100644
--- a/src/Storages/StorageFileCluster.cpp
+++ b/src/Storages/StorageFileCluster.cpp
@@ -67,19 +67,13 @@ StorageFileCluster::StorageFileCluster(

     auto & storage_columns = storage_metadata.columns;

-    if (context->getSettingsRef()[Setting::use_hive_partitioning])
-    {
-        const std::string sample_path = paths.empty() ? "" : paths.front();
-
-        HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
-            storage_columns,
-            hive_partition_columns_to_read_from_file_path,
-            sample_path,
-            columns_.empty(),
-            std::nullopt,
-            context
-        );
-    }
+    /// Not grabbing the file_columns because it is not necessary to do it here.
+    std::tie(hive_partition_columns_to_read_from_file_path, std::ignore) = HivePartitioningUtils::setupHivePartitioningForFileURLLikeStorage(
+        storage_columns,
+        paths.empty() ? "" : paths.front(),
+        columns_.empty(),
+        std::nullopt,
+        context);

     storage_metadata.setConstraints(constraints_);
     setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns));
diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp
index 354fd8c3b2b5..4bd77a879d68 100644
--- a/src/Storages/StorageURL.cpp
+++ b/src/Storages/StorageURL.cpp
@@ -189,19 +189,12 @@ IStorageURLBase::IStorageURLBase(

     auto & storage_columns = storage_metadata.columns;

-    if (context_->getSettingsRef()[Setting::use_hive_partitioning])
-    {
-        HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
-            storage_columns,
-            hive_partition_columns_to_read_from_file_path,
-            getSampleURI(uri, context_),
-            columns_.empty(),
-            format_settings,
-            context_);
-    }
-
-    /// If the `partition_strategy` argument is ever implemented for URL storage, this must be updated
-    file_columns = storage_columns.getAllPhysical();
+    std::tie(hive_partition_columns_to_read_from_file_path, file_columns) = HivePartitioningUtils::setupHivePartitioningForFileURLLikeStorage(
+        storage_columns,
+        getSampleURI(uri, context_),
+        columns_.empty(),
+        format_settings,
+        context_);

     auto virtual_columns_desc = VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns);
     if (!storage_metadata.getColumns().has("_headers"))
@@ -476,16 +469,9 @@ Chunk StorageURLSource::generate()
                 chunk_size = input_format->getApproxBytesReadForChunk();
             progress(num_rows, chunk_size ? chunk_size : chunk.bytes());
-            VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk(
-                chunk,
-                requested_virtual_columns,
-                {
-                    .path = curr_uri.getPath(),
-                    .size = current_file_size,
-                },
-                getContext());
-
-            // The order is important, hive partition columns must be added after virtual columns
+            /// The order is important, hive partition columns must be added before virtual columns
+            /// because they are part of the schema
             if (!hive_partition_columns_to_read_from_file_path.empty())
             {
                 const auto path = curr_uri.getPath();
@@ -495,6 +481,15 @@
                     path);
             }

+            VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk(
+                chunk,
+                requested_virtual_columns,
+                {
+                    .path = curr_uri.getPath(),
+                    .size = current_file_size,
+                },
+                getContext());
+
             chassert(dynamic_cast(read_buf.get()));
             if (need_headers_virtual_column)
             {
@@ -805,7 +800,7 @@ std::function IStorageURLBase::getReadPOSTDataCallback(
     return nullptr;
 }

-namespace
+namespace internal
 {
 class ReadBufferIterator : public IReadBufferIterator, WithContext
 {
@@ -1054,7 +1049,7 @@ std::pair IStorageURLBase::getTableStructureAndForma
     else
         urls_to_check = {uri};

-    ReadBufferIterator read_buffer_iterator(urls_to_check, format, compression_method, headers, format_settings, context);
+    internal::ReadBufferIterator read_buffer_iterator(urls_to_check, format, compression_method, headers, format_settings, context);

     if (format)
         return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format};
     return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context);
@@ -1689,6 +1684,7 @@ void registerStorageURL(StorageFactory & factory)
         ASTs & engine_args = args.engine_args;
         auto configuration = StorageURL::getConfiguration(engine_args, args.getLocalContext());
         auto format_settings = StorageURL::getFormatSettingsFromArgs(args);
+        auto context = args.getLocalContext();

         ASTPtr partition_by;
         if (args.storage_def->partition_by)
@@ -1702,7 +1698,7 @@
             args.columns,
             args.constraints,
             args.comment,
-            args.getContext(),
+            context,
             configuration.compression_method,
             configuration.headers,
             configuration.http_method,
diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp
index fff85117b37a..32011140558c 100644
--- a/src/Storages/StorageURLCluster.cpp
+++ b/src/Storages/StorageURLCluster.cpp
@@ -84,16 +84,13 @@ StorageURLCluster::StorageURLCluster(

     auto & storage_columns = storage_metadata.columns;

-    if (context->getSettingsRef()[Setting::use_hive_partitioning])
-    {
-        HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
-            storage_columns,
-            hive_partition_columns_to_read_from_file_path,
-            getSampleURI(uri, context),
-            columns_.empty(),
-            std::nullopt,
-            context);
-    }
+    /// Not grabbing the file_columns because it is not necessary to do it here.
+    std::tie(hive_partition_columns_to_read_from_file_path, std::ignore) = HivePartitioningUtils::setupHivePartitioningForFileURLLikeStorage(
+        storage_columns,
+        getSampleURI(uri, context),
+        columns_.empty(),
+        std::nullopt,
+        context);

     auto virtual_columns_desc = VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns);
     if (!storage_metadata.getColumns().has("_headers"))
diff --git a/src/Storages/prepareReadingFromFormat.cpp b/src/Storages/prepareReadingFromFormat.cpp
index 82c40020116b..5a99021221af 100644
--- a/src/Storages/prepareReadingFromFormat.cpp
+++ b/src/Storages/prepareReadingFromFormat.cpp
@@ -36,15 +36,17 @@ ReadFromFormatInfo prepareReadingFromFormat(
         columns_to_read.push_back(column_name);
     }

-    /// Create header for Source that will contain all requested columns including virtual and hive columns at the end
-    /// (because they will be added to the chunk after reading regular columns).
     info.source_header = storage_snapshot->getSampleBlockForColumns(columns_to_read);
-    for (const auto & requested_virtual_column : info.requested_virtual_columns)
-        info.source_header.insert({requested_virtual_column.type->createColumn(), requested_virtual_column.type, requested_virtual_column.name});

+    /// Create header for Source that will contain all requested columns including hive columns (which should be part of the schema) and virtual at the end
+    /// (because they will be added to the chunk after reading regular columns).
+    /// The order is important, hive partition columns must be added before virtual columns because they are part of the schema
     for (const auto & column_from_file_path : info.hive_partition_columns_to_read_from_file_path)
         info.source_header.insert({column_from_file_path.type->createColumn(), column_from_file_path.type, column_from_file_path.name});

+    for (const auto & requested_virtual_column : info.requested_virtual_columns)
+        info.source_header.insert({requested_virtual_column.type->createColumn(), requested_virtual_column.type, requested_virtual_column.name});
+
     /// Set requested columns that should be read from data.
     info.requested_columns = storage_snapshot->getColumnsByNames(GetColumnsOptions(GetColumnsOptions::All).withSubcolumns(), columns_to_read);
diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp
index 16407d4c4d53..42797bed4b45 100644
--- a/src/TableFunctions/TableFunctionFile.cpp
+++ b/src/TableFunctions/TableFunctionFile.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include

 namespace DB
@@ -114,9 +115,22 @@ ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context
             archive_info = StorageFile::getArchiveInfo(path_to_archive, filename, context->getUserFilesPath(), context, total_bytes_to_read);

+        ColumnsDescription columns;
         if (format == "auto")
-            return StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context, archive_info).first;
-        return StorageFile::getTableStructureFromFile(format, paths, compression_method, std::nullopt, context, archive_info);
+            columns = StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context, archive_info).first;
+        else
+            columns = StorageFile::getTableStructureFromFile(format, paths, compression_method, std::nullopt, context, archive_info);
+
+        auto sample_path = paths.empty() ? String{} : paths.front();
+
+        HivePartitioningUtils::setupHivePartitioningForFileURLLikeStorage(
+            columns,
+            sample_path,
+            /* inferred_schema */ true,
+            /* format_settings */ std::nullopt,
+            context);
+
+        return columns;
     }

     return parseColumnsListFromString(structure, context);
diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp
index b36bf1b63e3e..11a6f18f92e0 100644
--- a/src/TableFunctions/TableFunctionObjectStorage.cpp
+++ b/src/TableFunctions/TableFunctionObjectStorage.cpp
@@ -25,7 +25,7 @@
 #include
 #include
 #include
-
+#include

 namespace DB
 {
@@ -147,6 +147,14 @@ ColumnsDescription TableFunctionObjectStorage<
             sample_path,
             context);

+        HivePartitioningUtils::setupHivePartitioningForObjectStorage(
+            columns,
+            configuration,
+            sample_path,
+            /* inferred_schema */ true,
+            /* format_settings */ std::nullopt,
+            context);
+
         return columns;
     }
     return parseColumnsListFromString(configuration->structure, context);
diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp
index 7b5d62a03a43..9a2e19b20da9 100644
--- a/src/TableFunctions/TableFunctionURL.cpp
+++ b/src/TableFunctions/TableFunctionURL.cpp
@@ -18,6 +18,7 @@
 #include
 #include
+#include

 namespace DB
@@ -113,7 +114,7 @@ StoragePtr TableFunctionURL::getStorage(
         format,
         compression_method,
         StorageID(getDatabaseName(), table_name),
-        getActualTableStructure(global_context, /* is_insert_query */ true),
+        columns,
         ConstraintsDescription{},
         configuration);
 }
@@ -138,21 +139,36 @@ ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context,
 {
     if (structure == "auto")
     {
+        ColumnsDescription columns;
+
         context->checkAccess(getSourceAccessType());
         if (format == "auto")
-            return StorageURL::getTableStructureAndFormatFromData(
-                filename,
-                chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method),
-                configuration.headers,
-                std::nullopt,
-                context).first;
-
-        return StorageURL::getTableStructureFromData(format,
+        {
+            columns = StorageURL::getTableStructureAndFormatFromData(
+                filename,
+                chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method),
+                configuration.headers,
+                std::nullopt,
+                context).first;
+        }
+        else
+        {
+            columns = StorageURL::getTableStructureFromData(format,
+                filename,
+                chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method),
+                configuration.headers,
+                std::nullopt,
+                context);
+        }
+
+        HivePartitioningUtils::setupHivePartitioningForFileURLLikeStorage(
+            columns,
             filename,
-            chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method),
-            configuration.headers,
-            std::nullopt,
+            /* inferred_schema */ true,
+            /* format_settings */ std::nullopt,
             context);
+
+        return columns;
     }

     return parseColumnsListFromString(structure, context);
diff --git a/src/TableFunctions/TableFunctionURLCluster.cpp b/src/TableFunctions/TableFunctionURLCluster.cpp
index 84354d044143..0ec3921f9987 100644
--- a/src/TableFunctions/TableFunctionURLCluster.cpp
+++ b/src/TableFunctions/TableFunctionURLCluster.cpp
@@ -36,7 +36,7 @@ StoragePtr TableFunctionURLCluster::getStorage(
         format,
         compression_method,
         StorageID(getDatabaseName(), table_name),
-        getActualTableStructure(context, /* is_insert_query */ true),
+        getActualTableStructure(context, true),
         ConstraintsDescription{},
         configuration);
 }
diff --git a/tests/queries/0_stateless/01545_url_file_format_settings.sql b/tests/queries/0_stateless/01545_url_file_format_settings.sql
index 21991db98547..8563f242c3bf 100644
--- a/tests/queries/0_stateless/01545_url_file_format_settings.sql
+++ b/tests/queries/0_stateless/01545_url_file_format_settings.sql
@@ -1,3 +1,7 @@
+-- Tags: no-parallel
+
+set use_hive_partitioning=0; -- required because of "?query=select"
+
 create table file_delim(a int, b int) engine File(CSV, '01545_url_file_format_settings.csv') settings format_csv_delimiter = '|';

 truncate table file_delim;
diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference
index 85afdea228d2..a481cf0a28f1 100644
--- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference
+++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference
@@ -31,7 +31,7 @@
 Elizabeth Delgado
 Elizabeth Cross
 42 2020-01-01 [1,2,3] 42.42
-Array(Int64) Float64
+Array(Int64) LowCardinality(Float64)
 101
 2071
 2071
diff --git a/tests/queries/0_stateless/03363_hive_style_partition.reference b/tests/queries/0_stateless/03363_hive_style_partition.reference
index a340a0866d52..26299c33c0a1 100644
--- a/tests/queries/0_stateless/03363_hive_style_partition.reference
+++ b/tests/queries/0_stateless/03363_hive_style_partition.reference
@@ -34,3 +34,4 @@
 test/t_03363_function/year=2025/country=/.parquet 11
 1
 3 USA 2022 1
+1
diff --git a/tests/queries/0_stateless/03363_hive_style_partition.sql b/tests/queries/0_stateless/03363_hive_style_partition.sql
index e701fdba0681..425a12d7d2e4 100644
--- a/tests/queries/0_stateless/03363_hive_style_partition.sql
+++ b/tests/queries/0_stateless/03363_hive_style_partition.sql
@@ -1,6 +1,8 @@
 -- Tags: no-parallel, no-fasttest, no-random-settings

-DROP TABLE IF EXISTS t_03363_parquet, t_03363_csv;
+SET allow_suspicious_low_cardinality_types=1;
+
+DROP TABLE IF EXISTS t_03363_parquet, t_03363_csv, s3_table_half_schema_with_format;

 CREATE TABLE t_03363_parquet (year UInt16, country String, counter UInt8)
 ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partition_strategy='hive')
@@ -71,6 +73,21 @@ select * from s3(s3_conn, filename='t_03363_function_write_down_partition_column

 -- only partition columns
 INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_parquet', format=Parquet, partition_strategy='hive') PARTITION BY (year, country) SELECT 2020 as year, 'Brazil' as country; -- {serverError INCORRECT_DATA};

+-- Schema specified, but the hive partition column is missing in the schema (present in the data tho)
+INSERT INTO FUNCTION s3(s3_conn, filename='half_baked', format=Parquet, partition_strategy='hive') PARTITION BY year SELECT 1 AS key, 2020 AS year;
+
+-- Should fail because schema does not match since it is lacking the partition columns and `use_hive_partitioning=1`
+CREATE TABLE s3_table_half_schema_with_format (key UInt64) engine=S3(s3_conn, filename='half_baked/**.parquet', format=Parquet) SETTINGS use_hive_partitioning=1; -- {serverError BAD_ARGUMENTS}
+
+-- Should succeed because hive is off
+CREATE TABLE s3_table_half_schema_with_format (key UInt64) engine=S3(s3_conn, filename='half_baked/**.parquet', format=Parquet) SETTINGS use_hive_partitioning=0;
+
+-- Should succeed and not return hive columns. Distinct because maybe MinIO isn't cleaned up
+SELECT DISTINCT * FROM s3_table_half_schema_with_format;
+
+-- Should fail because the column year does not exist
+SELECT year, * FROM s3_table_half_schema_with_format; -- {serverError UNKNOWN_IDENTIFIER}
+
 -- hive with partition id placeholder
 CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8)
 ENGINE = S3(s3_conn, filename = 't_03363_parquet/{_partition_id}', format = Parquet, partition_strategy='hive')
@@ -121,4 +138,10 @@ CREATE TABLE t_invalid_expression (year UInt16, country String, counter Float64)
 CREATE TABLE t_03363_iceberg ENGINE=IcebergS3(s3_conn, filename = 'iceberg_data/default/t_iceberg/', format='parquet', url = 'http://minio1:9001/bucket/', partition_strategy='WILDCARD'); -- {serverError BAD_ARGUMENTS}
 CREATE TABLE t_03363_iceberg ENGINE=IcebergS3(s3_conn, filename = 'iceberg_data/default/t_iceberg/', format='parquet', url = 'http://minio1:9001/bucket/', partition_strategy='HIVE'); -- {serverError BAD_ARGUMENTS}
-DROP TABLE IF EXISTS t_03363_parquet, t_03363_csv;
+-- Should throw because format is not present and it is mandatory for hive strategy and it should not be a LOGICAL_ERROR
+CREATE TABLE t_03363_hive_requires_format (c0 Int) ENGINE = S3(s3_conn, partition_strategy = 'hive') PARTITION BY (c0); -- {serverError BAD_ARGUMENTS}
+
+-- Should throw because hive strategy does not allow partition columns to be the only columns
+CREATE TABLE t_03363_hive_only_partition_columns (country String, year UInt16) ENGINE = S3(s3_conn, partition_strategy='hive') PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS};
+
+DROP TABLE IF EXISTS t_03363_parquet, t_03363_csv, s3_table_half_schema_with_format;
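A minimal SQL sketch of the behavior exercised by the tests above, assuming the same `s3_conn` named collection and the `half_baked/` data written by the test (illustrative only, not part of the patch):

-- With `use_hive_partitioning=1`, a declared schema must include every partition column found in the path;
-- otherwise creation fails with BAD_ARGUMENTS (the new sanityCheckSchemaAndHivePartitionColumns check).
CREATE TABLE half_schema_demo (key UInt64)
ENGINE = S3(s3_conn, filename='half_baked/**.parquet', format=Parquet)
SETTINGS use_hive_partitioning=1; -- expected: BAD_ARGUMENTS

-- Turning hive partitioning off skips the check, and the path columns are simply not exposed.
CREATE TABLE half_schema_demo (key UInt64)
ENGINE = S3(s3_conn, filename='half_baked/**.parquet', format=Parquet)
SETTINGS use_hive_partitioning=0;

-- When the schema is inferred instead of declared, partition columns taken from the path are now
-- wrapped in LowCardinality where possible (e.g. LowCardinality(Float64) in 03203_hive_style_partitioning).
SELECT * FROM s3(s3_conn, filename='half_baked/**.parquet', format=Parquet)
SETTINGS use_hive_partitioning=1;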