Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Functions/keyvaluepair/impl/CHKeyValuePairExtractor.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ struct KeyValuePairExtractorReferenceMap : extractKV::KeyValuePairExtractor<extr
explicit KeyValuePairExtractorReferenceMap(const extractKV::Configuration & configuration_, std::size_t max_number_of_pairs_)
: KeyValuePairExtractor(configuration_, max_number_of_pairs_) {}

uint64_t extract(std::string_view data, absl::flat_hash_map<std::string_view, std::string_view> & map)
uint64_t extract(std::string_view data, std::map<std::string_view, std::string_view> & map)
{
auto pair_writer = typename StateHandler::PairWriter(map);
return extractImpl(data, pair_writer);
Expand Down
5 changes: 2 additions & 3 deletions src/Functions/keyvaluepair/impl/StateHandlerImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
#include <string_view>
#include <string>
#include <vector>
#include <absl/container/flat_hash_map.h>


namespace DB
Expand Down Expand Up @@ -577,13 +576,13 @@ struct ReferencesMapStateHandler : public StateHandlerImpl<false>
* */
class PairWriter
{
absl::flat_hash_map<std::string_view, std::string_view> & map;
std::map<std::string_view, std::string_view> & map;

std::string_view key;
std::string_view value;

public:
explicit PairWriter(absl::flat_hash_map<std::string_view, std::string_view> & map_)
explicit PairWriter(std::map<std::string_view, std::string_view> & map_)
: map(map_)
{}

Expand Down
125 changes: 120 additions & 5 deletions src/Storages/HivePartitioningUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <Core/Settings.h>
#include <Interpreters/Context.h>
#include <Interpreters/convertFieldToType.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <Functions/keyvaluepair/impl/KeyValuePairExtractorBuilder.h>
#include <Functions/keyvaluepair/impl/DuplicateKeyFoundException.h>
#include <Formats/EscapingRuleUtils.h>
Expand All @@ -20,6 +21,7 @@ namespace Setting
namespace ErrorCodes
{
extern const int INCORRECT_DATA;
extern const int BAD_ARGUMENTS;
}

namespace HivePartitioningUtils
Expand Down Expand Up @@ -83,7 +85,14 @@ NamesAndTypesList extractHivePartitionColumnsFromPath(
{
if (const auto type = tryInferDataTypeByEscapingRule(value, format_settings ? *format_settings : getFormatSettings(context), FormatSettings::EscapingRule::Raw))
{
hive_partition_columns_to_read_from_file_path.emplace_back(key, type);
if (type->canBeInsideLowCardinality())
{
hive_partition_columns_to_read_from_file_path.emplace_back(key, std::make_shared<DataTypeLowCardinality>(type));
}
else
{
hive_partition_columns_to_read_from_file_path.emplace_back(key, type);
}
}
else
{
Expand Down Expand Up @@ -122,6 +131,29 @@ void addPartitionColumnsToChunk(
}
}

/// Validates the relationship between the table schema and the hive partition columns:
/// 1. Every partition column must also be present in the schema (otherwise BAD_ARGUMENTS).
/// 2. The schema must contain at least one non-partition column (otherwise INCORRECT_DATA),
///    since a hive partitioned file cannot consist solely of partition columns.
void sanityCheckSchemaAndHivePartitionColumns(const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, const ColumnsDescription & storage_columns)
{
    for (const auto & partition_column : hive_partition_columns_to_read_from_file_path)
    {
        if (storage_columns.has(partition_column.name))
            continue;

        throw Exception(
            ErrorCodes::BAD_ARGUMENTS,
            "All hive partitioning columns must be present in the schema. Missing column: {}. "
            "If you do not want to use hive partitioning, try `use_hive_partitioning=0` and/or `partition_strategy != hive`",
            partition_column.name);
    }

    /// Equal sizes imply the schema is nothing but partition columns (all partition
    /// columns were just verified to be members of the schema).
    const bool only_partition_columns = storage_columns.size() == hive_partition_columns_to_read_from_file_path.size();
    if (only_partition_columns)
    {
        throw Exception(
            ErrorCodes::INCORRECT_DATA,
            "A hive partitioned file can't contain only partition columns. "
            "Try reading it with `use_hive_partitioning=0` and/or `partition_strategy != hive`");
    }
}

void extractPartitionColumnsFromPathAndEnrichStorageColumns(
ColumnsDescription & storage_columns,
NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
Expand All @@ -144,13 +176,96 @@ void extractPartitionColumnsFromPathAndEnrichStorageColumns(
}
}
}
}

if (hive_partition_columns_to_read_from_file_path.size() == storage_columns.size())
/// Determines which columns are hive partition columns (read from the file path) and which
/// are regular file columns for an object-storage table.
///
/// @param columns           Full storage schema; may be enriched with path-derived partition
///                          columns when the schema was inferred (see extractPartitionColumns...).
/// @param configuration     Storage configuration; consulted for the partition strategy and for
///                          whether partition columns are duplicated inside the data files.
/// @param sample_path       An example object path used to parse `key=value` partition segments.
/// @param inferred_schema   True when the schema was inferred rather than declared by the user.
/// @param format_settings   Optional format settings forwarded to type inference.
/// @param context           Query context (used to read the `use_hive_partitioning` setting).
/// @return Pair of (partition columns from path, file columns). Note that file columns exclude
///         the partition columns unless `partition_columns_in_data_file` is set.
/// @throws BAD_ARGUMENTS / INCORRECT_DATA via sanityCheckSchemaAndHivePartitionColumns.
HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForObjectStorage(
    ColumnsDescription & columns,
    const StorageObjectStorage::ConfigurationPtr & configuration,
    const std::string & sample_path,
    bool inferred_schema,
    std::optional<FormatSettings> format_settings,
    ContextPtr context)
{
    NamesAndTypesList hive_partition_columns_to_read_from_file_path;
    NamesAndTypesList file_columns;

    /*
     * If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
     * There is no need to read from the file's path.
     *
     * Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
     * And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
     */
    if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
    {
        hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns();
    }
    else if (context->getSettingsRef()[Setting::use_hive_partitioning])
    {
        extractPartitionColumnsFromPathAndEnrichStorageColumns(
            columns,
            hive_partition_columns_to_read_from_file_path,
            sample_path,
            inferred_schema,
            format_settings,
            context);
    }

    /// Runs even when no partition columns were found: the size check is trivially safe then.
    sanityCheckSchemaAndHivePartitionColumns(hive_partition_columns_to_read_from_file_path, columns);

    if (configuration->partition_columns_in_data_file)
    {
        /// Partition columns are duplicated inside the data files, so every physical column is a file column.
        file_columns = columns.getAllPhysical();
    }
    else
    {
        /// Partition columns exist only in the path: file columns = schema minus partition columns.
        std::unordered_set<String> hive_partition_columns_to_read_from_file_path_set;

        for (const auto & [name, type] : hive_partition_columns_to_read_from_file_path)
        {
            hive_partition_columns_to_read_from_file_path_set.insert(name);
        }

        for (const auto & [name, type] : columns.getAllPhysical())
        {
            if (!hive_partition_columns_to_read_from_file_path_set.contains(name))
            {
                file_columns.emplace_back(name, type);
            }
        }
    }

    return {hive_partition_columns_to_read_from_file_path, file_columns};
}

/// Determines hive partition columns and file columns for File/URL-like storages.
///
/// Bug fix: the previous version unconditionally threw INCORRECT_DATA at the top of the
/// `use_hive_partitioning` branch, making the subsequent partition-column extraction
/// unreachable — any read with `use_hive_partitioning=1` failed immediately. The throw was
/// redundant anyway: the "only partition columns" condition is already diagnosed by
/// sanityCheckSchemaAndHivePartitionColumns below.
///
/// @param columns          Full storage schema; may be enriched with path-derived partition columns.
/// @param sample_path      An example path used to parse `key=value` partition segments.
/// @param inferred_schema  True when the schema was inferred rather than user-declared.
/// @param format_settings  Optional format settings forwarded to type inference.
/// @param context          Query context (used to read the `use_hive_partitioning` setting).
/// @return Pair of (partition columns from path, file columns).
/// @throws BAD_ARGUMENTS / INCORRECT_DATA via sanityCheckSchemaAndHivePartitionColumns.
HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForFileURLLikeStorage(
    ColumnsDescription & columns,
    const std::string & sample_path,
    bool inferred_schema,
    std::optional<FormatSettings> format_settings,
    ContextPtr context)
{
    NamesAndTypesList hive_partition_columns_to_read_from_file_path;
    NamesAndTypesList file_columns;

    if (context->getSettingsRef()[Setting::use_hive_partitioning])
    {
        extractPartitionColumnsFromPathAndEnrichStorageColumns(
            columns,
            hive_partition_columns_to_read_from_file_path,
            sample_path,
            inferred_schema,
            format_settings,
            context);
    }

    sanityCheckSchemaAndHivePartitionColumns(hive_partition_columns_to_read_from_file_path, columns);

    /// Partition strategy is not implemented for File/URL storages,
    /// so there is no option to set whether hive partition columns are in the data file or not.
    file_columns = columns.getAllPhysical();

    return {hive_partition_columns_to_read_from_file_path, file_columns};
}

}
Expand Down
23 changes: 17 additions & 6 deletions src/Storages/HivePartitioningUtils.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#pragma once

#include <absl/container/flat_hash_map.h>
#include <Storages/ColumnsDescription.h>
#include <Storages/ObjectStorage/StorageObjectStorage.h>
#include <Core/NamesAndTypes.h>

namespace DB
{
Expand All @@ -10,7 +11,7 @@ class Chunk;

namespace HivePartitioningUtils
{
using HivePartitioningKeysAndValues = absl::flat_hash_map<std::string_view, std::string_view>;
using HivePartitioningKeysAndValues = std::map<std::string_view, std::string_view>;

HivePartitioningKeysAndValues parseHivePartitioningKeysAndValues(const std::string & path);

Expand All @@ -19,10 +20,20 @@ void addPartitionColumnsToChunk(
const NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
const std::string & path);

void extractPartitionColumnsFromPathAndEnrichStorageColumns(
ColumnsDescription & storage_columns,
NamesAndTypesList & hive_partition_columns_to_read_from_file_path,
const std::string & path,
/// Hive partition columns and file columns (Note that file columns might not contain the hive partition columns)
using HivePartitionColumnsWithFileColumnsPair = std::pair<NamesAndTypesList, NamesAndTypesList>;

HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForObjectStorage(
ColumnsDescription & columns,
const StorageObjectStorage::ConfigurationPtr & configuration,
const std::string & sample_path,
bool inferred_schema,
std::optional<FormatSettings> format_settings,
ContextPtr context);

HivePartitionColumnsWithFileColumnsPair setupHivePartitioningForFileURLLikeStorage(
ColumnsDescription & columns,
const std::string & sample_path,
bool inferred_schema,
std::optional<FormatSettings> format_settings,
ContextPtr context);
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/IPartitionStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ namespace

if (file_format.empty() || file_format == "auto")
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "File format can't be empty for hive style partitioning");
throw Exception(ErrorCodes::BAD_ARGUMENTS, "File format can't be empty for hive style partitioning");
}

const auto partition_key_description = KeyDescription::getKeyFromAST(partition_by, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context);
Expand Down
61 changes: 8 additions & 53 deletions src/Storages/ObjectStorage/StorageObjectStorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,56 +172,13 @@ StorageObjectStorage::StorageObjectStorage(
}
}

/*
* If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
* There is no need to read from the file's path.
*
* Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
* And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
*/
if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
{
hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns();
}
else if (context->getSettingsRef()[Setting::use_hive_partitioning])
{
HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
columns,
hive_partition_columns_to_read_from_file_path,
sample_path,
columns_in_table_or_function_definition.empty(),
format_settings,
context);
}

if (hive_partition_columns_to_read_from_file_path.size() == columns.size())
{
throw Exception(
ErrorCodes::INCORRECT_DATA,
"A hive partitioned file can't contain only partition columns. Try reading it with `partition_strategy=wildcard` and `use_hive_partitioning=0`");
}

if (configuration->partition_columns_in_data_file)
{
file_columns = columns;
}
else
{
std::unordered_set<String> hive_partition_columns_to_read_from_file_path_set;

for (const auto & [name, type] : hive_partition_columns_to_read_from_file_path)
{
hive_partition_columns_to_read_from_file_path_set.insert(name);
}

for (const auto & [name, type] : columns.getAllPhysical())
{
if (!hive_partition_columns_to_read_from_file_path_set.contains(name))
{
file_columns.add({name, type});
}
}
}
std::tie(hive_partition_columns_to_read_from_file_path, file_columns) = HivePartitioningUtils::setupHivePartitioningForObjectStorage(
columns,
configuration,
sample_path,
columns_in_table_or_function_definition.empty(),
format_settings,
context);

// Assert file contains at least one column. The assertion only takes place if we were able to deduce the schema. The storage might be empty.
if (!columns.empty() && file_columns.empty())
Expand Down Expand Up @@ -500,15 +457,13 @@ void StorageObjectStorage::read(
getName());
}

auto all_file_columns = file_columns.getAll();

auto read_from_format_info = configuration->prepareReadingFromFormat(
object_storage,
column_names,
storage_snapshot,
supportsSubsetOfColumns(local_context),
local_context,
PrepareReadingFromFormatHiveParams { all_file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap() });
PrepareReadingFromFormatHiveParams { file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap() });

const bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty())
&& local_context->getSettingsRef()[Setting::optimize_count_from_files];
Expand Down
2 changes: 1 addition & 1 deletion src/Storages/ObjectStorage/StorageObjectStorage.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ class StorageObjectStorage : public IStorage
bool update_configuration_on_read_write = true;

NamesAndTypesList hive_partition_columns_to_read_from_file_path;
ColumnsDescription file_columns;
NamesAndTypesList file_columns;

LoggerPtr log;
};
Expand Down
40 changes: 8 additions & 32 deletions src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ namespace Setting
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int INCORRECT_DATA;
}

String StorageObjectStorageCluster::getPathSample(ContextPtr context)
Expand Down Expand Up @@ -88,37 +87,14 @@ StorageObjectStorageCluster::StorageObjectStorageCluster(
if (sample_path.empty() && context_->getSettingsRef()[Setting::use_hive_partitioning] && !configuration->isDataLakeConfiguration() && !configuration->partition_strategy)
sample_path = getPathSample(context_);

/*
* If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression.
* There is no need to read from the filepath.
*
* Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path.
* And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns
*/
if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE)
{
hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns();
}
else if (context_->getSettingsRef()[Setting::use_hive_partitioning])
{
HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns(
columns,
hive_partition_columns_to_read_from_file_path,
sample_path,
columns_in_table_or_function_definition.empty(),
std::nullopt,
context_
);
}

if (hive_partition_columns_to_read_from_file_path.size() == columns.size())
{
throw Exception(
ErrorCodes::INCORRECT_DATA,
"A hive partitioned file can't contain only partition columns. Try reading it with `partition_strategy=wildcard` and `use_hive_partitioning=0`");
}

/// Hive: Not building the file_columns like `StorageObjectStorage` does because it is not necessary to do it here.
/// Not grabbing the file_columns because it is not necessary to do it here.
std::tie(hive_partition_columns_to_read_from_file_path, std::ignore) = HivePartitioningUtils::setupHivePartitioningForObjectStorage(
columns,
configuration,
sample_path,
columns_in_table_or_function_definition.empty(),
std::nullopt,
context_);

StorageInMemoryMetadata metadata;
metadata.setColumns(columns);
Expand Down
Loading
Loading