From 7308781e34db04a22c50465b826ba98a4348846f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 21 Aug 2025 09:12:15 -0300 Subject: [PATCH] Revert "Antalya 25.6.5: Object storage hive reads & writes" --- .../integrations/azureBlobStorage.md | 33 +- .../engines/table-engines/integrations/s3.md | 50 +-- .../table-functions/azureBlobStorage.md | 53 +-- docs/en/sql-reference/table-functions/file.md | 2 +- docs/en/sql-reference/table-functions/hdfs.md | 2 +- docs/en/sql-reference/table-functions/s3.md | 54 +-- docs/en/sql-reference/table-functions/url.md | 2 +- src/Core/NamesAndTypes.cpp | 9 - src/Core/NamesAndTypes.h | 3 - src/Functions/CMakeLists.txt | 1 - src/Functions/generateSnowflakeID.cpp | 8 - src/Functions/generateSnowflakeID.h | 10 - src/Interpreters/InterpreterInsertQuery.cpp | 2 - src/Storages/Hive/StorageHive.cpp | 2 +- src/Storages/HivePartitioningUtils.cpp | 158 -------- src/Storages/HivePartitioningUtils.h | 32 -- src/Storages/IPartitionStrategy.cpp | 377 ------------------ src/Storages/IPartitionStrategy.h | 128 ------ .../ObjectStorage/Azure/Configuration.cpp | 182 +-------- .../ObjectStorage/Azure/Configuration.h | 25 +- .../ObjectStorage/DataLakes/Common.cpp | 2 +- .../DataLakes/DataLakeConfiguration.h | 3 +- .../DataLakes/DeltaLake/KernelHelper.cpp | 2 +- .../DataLakes/DeltaLakeMetadata.cpp | 15 +- .../DataLakes/Iceberg/IcebergMetadata.cpp | 27 +- .../ObjectStorage/HDFS/Configuration.cpp | 19 +- .../ObjectStorage/HDFS/Configuration.h | 19 +- .../ObjectStorage/Local/Configuration.h | 3 +- .../ObjectStorage/S3/Configuration.cpp | 99 +---- src/Storages/ObjectStorage/S3/Configuration.h | 15 +- .../ObjectStorage/StorageObjectStorage.cpp | 266 +++--------- .../ObjectStorage/StorageObjectStorage.h | 70 +--- .../StorageObjectStorageCluster.cpp | 57 +-- .../StorageObjectStorageCluster.h | 6 +- .../StorageObjectStorageSink.cpp | 73 ++-- .../ObjectStorage/StorageObjectStorageSink.h | 6 +- .../StorageObjectStorageSource.cpp | 85 ++-- .../StorageObjectStorageSource.h | 3 - .../ObjectStorageQueueSource.cpp | 14 +- .../StorageObjectStorageQueue.cpp | 13 +- src/Storages/PartitionedSink.cpp | 38 +- src/Storages/PartitionedSink.h | 12 +- src/Storages/StorageFile.cpp | 75 +--- src/Storages/StorageFile.h | 4 - src/Storages/StorageFileCluster.cpp | 27 +- src/Storages/StorageFileCluster.h | 1 - src/Storages/StorageURL.cpp | 79 +--- src/Storages/StorageURL.h | 5 +- src/Storages/StorageURLCluster.cpp | 19 +- src/Storages/StorageURLCluster.h | 1 - src/Storages/VirtualColumnUtils.cpp | 120 +++++- src/Storages/VirtualColumnUtils.h | 21 +- src/Storages/checkAndGetLiteralArgument.cpp | 27 -- src/Storages/checkAndGetLiteralArgument.h | 3 - src/Storages/prepareReadingFromFormat.cpp | 16 +- src/Storages/prepareReadingFromFormat.h | 18 +- ...ils.cpp => gtest_virtual_column_utils.cpp} | 26 +- src/TableFunctions/ITableFunction.h | 4 - .../TableFunctionObjectStorage.cpp | 7 +- .../TableFunctionObjectStorage.h | 6 - .../TableFunctionObjectStorageCluster.cpp | 3 +- .../test_storage_azure_blob_storage/test.py | 77 +--- tests/integration/test_storage_s3/test.py | 2 +- .../0_stateless/01944_insert_partition_by.sql | 2 + .../03203_hive_style_partitioning.reference | 142 +++---- .../03203_hive_style_partitioning.sh | 22 +- .../03363_hive_style_partition.reference | 36 -- .../03363_hive_style_partition.sql | 124 ------ ...3_globbed_path_in_bucket_portion.reference | 0 ...3364_s3_globbed_path_in_bucket_portion.sql | 30 -- ...528_s3_insert_partition_by_whitespaces.sql | 13 +- 
...on_by_require_partition_wildcard.reference | 0 ...artition_by_require_partition_wildcard.sql | 3 - 73 files changed, 590 insertions(+), 2303 deletions(-) delete mode 100644 src/Functions/generateSnowflakeID.h delete mode 100644 src/Storages/HivePartitioningUtils.cpp delete mode 100644 src/Storages/HivePartitioningUtils.h delete mode 100644 src/Storages/IPartitionStrategy.cpp delete mode 100644 src/Storages/IPartitionStrategy.h rename src/Storages/tests/{gtest_hive_partitioning_utils.cpp => gtest_virtual_column_utils.cpp} (66%) delete mode 100644 tests/queries/0_stateless/03363_hive_style_partition.reference delete mode 100644 tests/queries/0_stateless/03363_hive_style_partition.sql delete mode 100644 tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.reference delete mode 100644 tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql delete mode 100644 tests/queries/0_stateless/03547_s3_partition_by_require_partition_wildcard.reference delete mode 100644 tests/queries/0_stateless/03547_s3_partition_by_require_partition_wildcard.sql diff --git a/docs/en/engines/table-engines/integrations/azureBlobStorage.md b/docs/en/engines/table-engines/integrations/azureBlobStorage.md index a174b2029298..5640e57df3e9 100644 --- a/docs/en/engines/table-engines/integrations/azureBlobStorage.md +++ b/docs/en/engines/table-engines/integrations/azureBlobStorage.md @@ -14,7 +14,7 @@ This engine provides an integration with [Azure Blob Storage](https://azure.micr ```sql CREATE TABLE azure_blob_storage_table (name String, value UInt32) - ENGINE = AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, partition_strategy, partition_columns_in_data_file]) + ENGINE = AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression]) [PARTITION BY expr] [SETTINGS ...] ``` @@ -30,8 +30,6 @@ CREATE TABLE azure_blob_storage_table (name String, value UInt32) - `account_key` - if storage_account_url is used, then account key can be specified here - `format` — The [format](/interfaces/formats.md) of the file. - `compression` — Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, it will autodetect compression by file extension. (same as setting to `auto`). -- `partition_strategy` – Options: `WILDCARD` or `HIVE`. `WILDCARD` requires a `{_partition_id}` in the path, which is replaced with the partition key. `HIVE` does not allow wildcards, assumes the path is the table root, and generates Hive-style partitioned directories with Snowflake IDs as filenames and the file format as the extension. Defaults to `WILDCARD` -- `partition_columns_in_data_file` - Only used with `HIVE` partition strategy. Tells ClickHouse whether to expect partition columns to be written in the data file. Defaults `false`. **Example** @@ -98,35 +96,6 @@ SETTINGS filesystem_cache_name = 'cache_for_azure', enable_filesystem_cache = 1; 2. reuse cache configuration (and therefore cache storage) from clickhouse `storage_configuration` section, [described here](/operations/storing-data.md/#using-local-cache) -### PARTITION BY {#partition-by} - -`PARTITION BY` — Optional. In most cases you don't need a partition key, and if it is needed you generally don't need a partition key more granular than by month. Partitioning does not speed up queries (in contrast to the ORDER BY expression). You should never use too granular partitioning. 
Don't partition your data by client identifiers or names (instead, make client identifier or name the first column in the ORDER BY expression). - -For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format. - -#### Partition strategy {#partition-strategy} - -`WILDCARD` (default): Replaces the `{_partition_id}` wildcard in the file path with the actual partition key. Reading is not supported. - -`HIVE` implements hive style partitioning for reads & writes. Reading is implemented using a recursive glob pattern. Writing generates files using the following format: `//.`. - -Note: When using `HIVE` partition strategy, the `use_hive_partitioning` setting has no effect. - -Example of `HIVE` partition strategy: - -```sql -arthur :) create table azure_table (year UInt16, country String, counter UInt8) ENGINE=AzureBlobStorage(account_name='devstoreaccount1', account_key='Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', storage_account_url = 'http://localhost:30000/devstoreaccount1', container='cont', blob_path='hive_partitioned', format='Parquet', compression='auto', partition_strategy='hive') PARTITION BY (year, country); - -arthur :) insert into azure_table values (2020, 'Russia', 1), (2021, 'Brazil', 2); - -arthur :) select _path, * from azure_table; - - ┌─_path──────────────────────────────────────────────────────────────────────┬─year─┬─country─┬─counter─┐ -1. │ cont/hive_partitioned/year=2020/country=Russia/7351305360873664512.parquet │ 2020 │ Russia │ 1 │ -2. │ cont/hive_partitioned/year=2021/country=Brazil/7351305360894636032.parquet │ 2021 │ Brazil │ 2 │ - └────────────────────────────────────────────────────────────────────────────┴──────┴─────────┴─────────┘ -``` - ## See also {#see-also} [Azure Blob Storage Table Function](/sql-reference/table-functions/azureBlobStorage) diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 934af30f34b7..456482a56145 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -34,7 +34,7 @@ SELECT * FROM s3_engine_table LIMIT 2; ```sql CREATE TABLE s3_engine_table (name String, value UInt32) - ENGINE = S3(path [, NOSIGN | aws_access_key_id, aws_secret_access_key,] format, [compression], [partition_strategy], [partition_columns_in_data_file]) + ENGINE = S3(path [, NOSIGN | aws_access_key_id, aws_secret_access_key,] format, [compression]) [PARTITION BY expr] [SETTINGS ...] ``` @@ -46,8 +46,6 @@ CREATE TABLE s3_engine_table (name String, value UInt32) - `format` — The [format](/sql-reference/formats#formats-overview) of the file. - `aws_access_key_id`, `aws_secret_access_key` - Long-term credentials for the [AWS](https://aws.amazon.com/) account user. You can use these to authenticate your requests. Parameter is optional. If credentials are not specified, they are used from the configuration file. For more information see [Using S3 for Data Storage](../mergetree-family/mergetree.md#table_engine-mergetree-s3). - `compression` — Compression type. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Parameter is optional. By default, it will auto-detect compression by file extension. -- `partition_strategy` – Options: `WILDCARD` or `HIVE`. 
`WILDCARD` requires a `{_partition_id}` in the path, which is replaced with the partition key. `HIVE` does not allow wildcards, assumes the path is the table root, and generates Hive-style partitioned directories with Snowflake IDs as filenames and the file format as the extension. Defaults to `WILDCARD` -- `partition_columns_in_data_file` - Only used with `HIVE` partition strategy. Tells ClickHouse whether to expect partition columns to be written in the data file. Defaults `false`. ### Data cache {#data-cache} @@ -86,52 +84,6 @@ There are two ways to define cache in configuration file. For partitioning by month, use the `toYYYYMM(date_column)` expression, where `date_column` is a column with a date of the type [Date](/sql-reference/data-types/date.md). The partition names here have the `"YYYYMM"` format. -#### Partition strategy {#partition-strategy} - -`WILDCARD` (default): Replaces the `{_partition_id}` wildcard in the file path with the actual partition key. Reading is not supported. - -`HIVE` implements hive style partitioning for reads & writes. Reading is implemented using a recursive glob pattern, it is equivalent to `SELECT * FROM s3('table_root/**.parquet')`. -Writing generates files using the following format: `//.`. - -Note: When using `HIVE` partition strategy, the `use_hive_partitioning` setting has no effect. - -Example of `HIVE` partition strategy: - -```sql -arthur :) CREATE TABLE t_03363_parquet (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partition_strategy='hive') -PARTITION BY (year, country); - -arthur :) INSERT INTO t_03363_parquet VALUES - (2022, 'USA', 1), - (2022, 'Canada', 2), - (2023, 'USA', 3), - (2023, 'Mexico', 4), - (2024, 'France', 5), - (2024, 'Germany', 6), - (2024, 'Germany', 7), - (1999, 'Brazil', 8), - (2100, 'Japan', 9), - (2024, 'CN', 10), - (2025, '', 11); - -arthur :) select _path, * from t_03363_parquet; - - ┌─_path──────────────────────────────────────────────────────────────────────┬─year─┬─country─┬─counter─┐ - 1. │ test/t_03363_parquet/year=2100/country=Japan/7329604473272971264.parquet │ 2100 │ Japan │ 9 │ - 2. │ test/t_03363_parquet/year=2024/country=France/7329604473323302912.parquet │ 2024 │ France │ 5 │ - 3. │ test/t_03363_parquet/year=2022/country=Canada/7329604473314914304.parquet │ 2022 │ Canada │ 2 │ - 4. │ test/t_03363_parquet/year=1999/country=Brazil/7329604473289748480.parquet │ 1999 │ Brazil │ 8 │ - 5. │ test/t_03363_parquet/year=2023/country=Mexico/7329604473293942784.parquet │ 2023 │ Mexico │ 4 │ - 6. │ test/t_03363_parquet/year=2023/country=USA/7329604473319108608.parquet │ 2023 │ USA │ 3 │ - 7. │ test/t_03363_parquet/year=2025/country=/7329604473327497216.parquet │ 2025 │ │ 11 │ - 8. │ test/t_03363_parquet/year=2024/country=CN/7329604473310720000.parquet │ 2024 │ CN │ 10 │ - 9. │ test/t_03363_parquet/year=2022/country=USA/7329604473298137088.parquet │ 2022 │ USA │ 1 │ -10. │ test/t_03363_parquet/year=2024/country=Germany/7329604473306525696.parquet │ 2024 │ Germany │ 6 │ -11. 
│ test/t_03363_parquet/year=2024/country=Germany/7329604473306525696.parquet │ 2024 │ Germany │ 7 │ - └────────────────────────────────────────────────────────────────────────────┴──────┴─────────┴─────────┘ -``` - ### Querying partitioned data {#querying-partitioned-data} This example uses the [docker compose recipe](https://github.com/ClickHouse/examples/tree/5fdc6ff72f4e5137e23ea075c88d3f44b0202490/docker-compose-recipes/recipes/ch-and-minio-S3), which integrates ClickHouse and MinIO. You should be able to reproduce the same queries using S3 by replacing the endpoint and authentication values. diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md index 3d3c66b0b676..6ab4de94cdaa 100644 --- a/docs/en/sql-reference/table-functions/azureBlobStorage.md +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -18,23 +18,21 @@ Provides a table-like interface to select/insert files in [Azure Blob Storage](h ## Syntax {#syntax} ```sql -azureBlobStorage(- connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure, partition_strategy, partition_columns_in_data_file]) +azureBlobStorage(- connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure]) ``` ## Arguments {#arguments} -| Argument | Description | -|---------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `connection_string`\| `storage_account_url` | connection_string includes account name & key ([Create connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json#configure-a-connection-string-for-an-azure-storage-account)) or you could also provide the storage account url here and account name & account key as separate parameters (see parameters account_name & account_key) | -| `container_name` | Container name | -| `blobpath` | file path. Supports following wildcards in readonly mode: `*`, `**`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. | -| `account_name` | if storage_account_url is used, then account name can be specified here | -| `account_key` | if storage_account_url is used, then account key can be specified here | -| `format` | The [format](/sql-reference/formats) of the file. | -| `compression` | Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, it will autodetect compression by file extension. (same as setting to `auto`). | -| `structure` | Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. | -| `partition_strategy` | Parameter is optional. Supported values: `WILDCARD` or `HIVE`. `WILDCARD` requires a `{_partition_id}` in the path, which is replaced with the partition key. 
`HIVE` does not allow wildcards, assumes the path is the table root, and generates Hive-style partitioned directories with Snowflake IDs as filenames and the file format as the extension. Defaults to `WILDCARD` | -| `partition_columns_in_data_file` | Parameter is optional. Only used with `HIVE` partition strategy. Tells ClickHouse whether to expect partition columns to be written in the data file. Defaults `false`. | +| Argument | Description | +|--------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `connection_string`\| `storage_account_url` | connection_string includes account name & key ([Create connection string](https://learn.microsoft.com/en-us/azure/storage/common/storage-configure-connection-string?toc=%2Fazure%2Fstorage%2Fblobs%2Ftoc.json&bc=%2Fazure%2Fstorage%2Fblobs%2Fbreadcrumb%2Ftoc.json#configure-a-connection-string-for-an-azure-storage-account)) or you could also provide the storage account url here and account name & account key as separate parameters (see parameters account_name & account_key)| +| `container_name` | Container name | +| `blobpath` | file path. Supports following wildcards in readonly mode: `*`, `**`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. | +| `account_name` | if storage_account_url is used, then account name can be specified here | +| `account_key` | if storage_account_url is used, then account key can be specified here | +| `format` | The [format](/sql-reference/formats) of the file. | +| `compression` | Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, it will autodetect compression by file extension. (same as setting to `auto`). | +| `structure` | Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. | ## Returned value {#returned_value} @@ -86,34 +84,7 @@ SELECT count(*) FROM azureBlobStorage('DefaultEndpointsProtocol=https;AccountNam - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. -## Partitioned Write {#partitioned-write} - -### Partition Strategy {#partition-strategy} - -Supported for INSERT queries only. - -`WILDCARD` (default): Replaces the `{_partition_id}` wildcard in the file path with the actual partition key. - -`HIVE` implements hive style partitioning for reads & writes. It generates files using the following format: `//.`. 
- -**Example of `HIVE` partition strategy** - -```sql -INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = 'http://localhost:30000/devstoreaccount1', container='cont', blob_path='azure_table_root', format='CSVWithNames', compression='auto', structure='year UInt16, country String, id Int32', partition_strategy='hive') PARTITION BY (year, country) VALUES (2020, 'Russia', 1), (2021, 'Brazil', 2); -``` - -```result -select _path, * from azureBlobStorage(azure_conf2, storage_account_url = 'http://localhost:30000/devstoreaccount1', container='cont', blob_path='azure_table_root/**.csvwithnames') - - ┌─_path───────────────────────────────────────────────────────────────────────────┬─id─┬─year─┬─country─┐ -1. │ cont/azure_table_root/year=2021/country=Brazil/7351307847391293440.csvwithnames │ 2 │ 2021 │ Brazil │ -2. │ cont/azure_table_root/year=2020/country=Russia/7351307847378710528.csvwithnames │ 1 │ 2020 │ Russia │ - └─────────────────────────────────────────────────────────────────────────────────┴────┴──────┴─────────┘ -``` - -## use_hive_partitioning setting {#hive-style-partitioning} - -This is a hint for ClickHouse to parse hive style partitioned files upon reading time. It has no effect on writing. For symmetrical reads and writes, use the `partition_strategy` argument. +## Hive-style partitioning {#hive-style-partitioning} When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index b7d72e7f6ff5..f14be49ab27c 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -217,7 +217,7 @@ SELECT count(*) FROM file('big_dir/**/file002', 'CSV', 'name String, value UInt3 - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. -## use_hive_partitioning setting {#hive-style-partitioning} +## Hive-style partitioning {#hive-style-partitioning} When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index a801421c531a..6d88b08a7d88 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -110,7 +110,7 @@ FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name Strin - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. 
-## use_hive_partitioning setting {#hive-style-partitioning} +## Hive-style partitioning {#hive-style-partitioning} When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 323f0618f96b..aaa237057c62 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -23,7 +23,7 @@ When using the `s3 table function` with [`INSERT INTO...SELECT`](../../sql-refer ## Syntax {#syntax} ```sql -s3(url [, NOSIGN | access_key_id, secret_access_key, [session_token]] [,format] [,structure] [,compression_method],[,headers], [,partition_strategy], [,partition_columns_in_data_file]) +s3(url [, NOSIGN | access_key_id, secret_access_key, [session_token]] [,format] [,structure] [,compression_method],[,headers]) s3(named_collection[, option=value [,..]]) ``` @@ -37,18 +37,16 @@ For GCS, substitute your HMAC key and HMAC secret where you see `access_key_id` `s3` table function supports the following plain parameters: -| Parameter | Description | -|-----------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `url` | Bucket url with path to file. Supports following wildcards in readonly mode: `*`, `**`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. For more information see [here](../../engines/table-engines/integrations/s3.md#wildcards-in-path). | -| `NOSIGN` | If this keyword is provided in place of credentials, all the requests will not be signed. | -| `access_key_id` and `secret_access_key` | Keys that specify credentials to use with given endpoint. Optional. | -| `session_token` | Session token to use with the given keys. Optional when passing keys. | -| `format` | The [format](/sql-reference/formats) of the file. | -| `structure` | Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. | -| `compression_method` | Parameter is optional. Supported values: `none`, `gzip` or `gz`, `brotli` or `br`, `xz` or `LZMA`, `zstd` or `zst`. By default, it will autodetect compression method by file extension. | -| `headers` | Parameter is optional. Allows headers to be passed in the S3 request. Pass in the format `headers(key=value)` e.g. `headers('x-amz-request-payer' = 'requester')`. | -| `partition_strategy` | Parameter is optional. Supported values: `WILDCARD` or `HIVE`. `WILDCARD` requires a `{_partition_id}` in the path, which is replaced with the partition key. `HIVE` does not allow wildcards, assumes the path is the table root, and generates Hive-style partitioned directories with Snowflake IDs as filenames and the file format as the extension. Defaults to `WILDCARD` | -| `partition_columns_in_data_file` | Parameter is optional. Only used with `HIVE` partition strategy. Tells ClickHouse whether to expect partition columns to be written in the data file. Defaults `false`. 
| +| Parameter | Description | +|-----------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `url` | Bucket url with path to file. Supports following wildcards in readonly mode: `*`, `**`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. For more information see [here](../../engines/table-engines/integrations/s3.md#wildcards-in-path). | +| `NOSIGN` | If this keyword is provided in place of credentials, all the requests will not be signed. | +| `access_key_id` and `secret_access_key` | Keys that specify credentials to use with given endpoint. Optional. | +| `session_token` | Session token to use with the given keys. Optional when passing keys. | +| `format` | The [format](/sql-reference/formats) of the file. | +| `structure` | Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. | +| `compression_method` | Parameter is optional. Supported values: `none`, `gzip` or `gz`, `brotli` or `br`, `xz` or `LZMA`, `zstd` or `zst`. By default, it will autodetect compression method by file extension. | +| `headers` | Parameter is optional. Allows headers to be passed in the S3 request. Pass in the format `headers(key=value)` e.g. `headers('x-amz-request-payer' = 'requester')`. | :::note GCS The GCS url is in this format as the endpoint for the Google XML API is different than the JSON API: @@ -226,29 +224,9 @@ FROM s3(creds, url='https://s3-object-url.csv') ## Partitioned Write {#partitioned-write} -### Partition Strategy {#partition-strategy} +If you specify `PARTITION BY` expression when inserting data into `S3` table, a separate file is created for each partition value. Splitting the data into separate files helps to improve reading operations efficiency. -Supported for INSERT queries only. - -`WILDCARD` (default): Replaces the `{_partition_id}` wildcard in the file path with the actual partition key. - -`HIVE` implements hive style partitioning for reads & writes. It generates files using the following format: `//.`. - -**Example of `HIVE` partition strategy** - -```sql -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partition_strategy='hive') PARTITION BY (year, country) SELECT 2020 as year, 'Russia' as country, 1 as id; -``` - -```result -SELECT _path, * FROM s3(s3_conn, filename='t_03363_function/**.parquet'); - - ┌─_path──────────────────────────────────────────────────────────────────────┬─id─┬─country─┬─year─┐ -1. │ test/t_03363_function/year=2020/country=Russia/7351295896279887872.parquet │ 1 │ Russia │ 2020 │ - └────────────────────────────────────────────────────────────────────────────┴────┴─────────┴──────┘ -``` - -**Examples of `WILDCARD` partition strategy** +**Examples** 1. Using partition ID in a key creates separate files: @@ -340,14 +318,14 @@ Note that rows can only be inserted into new files. There are no merge cycles or - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. In case of archive shows uncompressed file size of the file inside the archive. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. -## use_hive_partitioning setting {#hive-style-partitioning} - -This is a hint for ClickHouse to parse hive style partitioned files upon reading time. 
It has no effect on writing. For symmetrical reads and writes, use the `partition_strategy` argument. +## Hive-style partitioning {#hive-style-partitioning} When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. **Example** +Use virtual column, created with Hive-style partitioning + ```sql SELECT * from s3('s3://data/path/date=*/country=*/code=*/*.parquet') where date > '2020-01-01' and country = 'Netherlands' and code = 42; ``` diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index a4126fb5cd9d..9c2b68af5b08 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -63,7 +63,7 @@ Character `|` inside patterns is used to specify failover addresses. They are it - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. - `_headers` - HTTP response headers. Type: `Map(LowCardinality(String), LowCardinality(String))`. -## use_hive_partitioning setting {#hive-style-partitioning} +## Hive-style partitioning {#hive-style-partitioning} When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index a7379775b585..eb93685570f5 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -172,15 +172,6 @@ NameSet NamesAndTypesList::getNameSet() const return res; } -std::unordered_map NamesAndTypesList::getNameToTypeMap() const -{ - std::unordered_map res; - res.reserve(size()); - for (const NameAndTypePair & column : *this) - res.emplace(column.name, column.type); - return res; -} - DataTypes NamesAndTypesList::getTypes() const { DataTypes res; diff --git a/src/Core/NamesAndTypes.h b/src/Core/NamesAndTypes.h index 1ae4d8ff93ba..61db974fa8ba 100644 --- a/src/Core/NamesAndTypes.h +++ b/src/Core/NamesAndTypes.h @@ -101,9 +101,6 @@ class NamesAndTypesList : public std::list NameSet getNameSet() const; DataTypes getTypes() const; - /// Creates a mapping from name to the type - std::unordered_map getNameToTypeMap() const; - /// Remove columns which names are not in the `names`. 
void filterColumns(const NameSet & names); diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index e7492b9bc511..9009bdeccea5 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -23,7 +23,6 @@ set(DBMS_FUNCTIONS FunctionsConversion_impl2.cpp FunctionsConversion_impl3.cpp extractTimeZoneFromFunctionArguments.cpp # extractTimeZoneFromFunctionArguments (DateTimeTransforms.h, FunctionsConversion.cpp) - generateSnowflakeID.cpp ) extract_into_parent_list(clickhouse_functions_sources dbms_sources ${DBMS_FUNCTIONS}) extract_into_parent_list(clickhouse_functions_headers dbms_headers diff --git a/src/Functions/generateSnowflakeID.cpp b/src/Functions/generateSnowflakeID.cpp index 0fcb526272f3..40d9dee2c0ef 100644 --- a/src/Functions/generateSnowflakeID.cpp +++ b/src/Functions/generateSnowflakeID.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -148,13 +147,6 @@ struct Data } -uint64_t generateSnowflakeID() -{ - Data data; - SnowflakeId snowflake_id = data.reserveRange(getMachineId(), 1); - return fromSnowflakeId(snowflake_id); -} - class FunctionGenerateSnowflakeID : public IFunction { public: diff --git a/src/Functions/generateSnowflakeID.h b/src/Functions/generateSnowflakeID.h deleted file mode 100644 index 38fa684a9b4b..000000000000 --- a/src/Functions/generateSnowflakeID.h +++ /dev/null @@ -1,10 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - -uint64_t generateSnowflakeID(); - -} diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 32f0cddf4f05..04f7dbbabde2 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -147,8 +147,6 @@ StoragePtr InterpreterInsertQuery::getTable(ASTInsertQuery & query) table_function_ptr->setStructureHint(structure_hint); } - table_function_ptr->setPartitionBy(query.partition_by); - return table_function_ptr->execute(query.table_function, current_context, table_function_ptr->getName(), /* cached_columns */ {}, /* use_global_context */ false, /* is_insert_query */true); } diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index 1289aaf58d04..8db337f7ff2f 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -441,7 +441,7 @@ StorageHive::StorageHive( storage_metadata.setComment(comment_); storage_metadata.partition_key = KeyDescription::getKeyFromAST(partition_by_ast, storage_metadata.columns, getContext()); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, getContext())); setInMemoryMetadata(storage_metadata); } diff --git a/src/Storages/HivePartitioningUtils.cpp b/src/Storages/HivePartitioningUtils.cpp deleted file mode 100644 index a7496cb98cc0..000000000000 --- a/src/Storages/HivePartitioningUtils.cpp +++ /dev/null @@ -1,158 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace Setting -{ - extern const SettingsBool use_hive_partitioning; -} - -namespace ErrorCodes -{ - extern const int INCORRECT_DATA; -} - -namespace HivePartitioningUtils -{ - -static auto makeExtractor() -{ - return KeyValuePairExtractorBuilder().withItemDelimiters({'/'}).withKeyValueDelimiter('=').buildWithReferenceMap(); -} - -HivePartitioningKeysAndValues parseHivePartitioningKeysAndValues(const String & path) -{ - static auto extractor = 
makeExtractor(); - - HivePartitioningKeysAndValues key_values; - - // cutting the filename to prevent malformed filenames that contain key-value-pairs from being extracted - // not sure if we actually need to do that, but just in case. Plus, the previous regex impl took care of it - const auto last_slash_pos = path.find_last_of('/'); - - if (last_slash_pos == std::string::npos) - { - // nothing to extract, there is no path, just a filename - return key_values; - } - - std::string_view path_without_filename(path.data(), last_slash_pos); - - try - { - extractor.extract(path_without_filename, key_values); - } - catch (const extractKV::DuplicateKeyFoundException & ex) - { - throw Exception(ErrorCodes::INCORRECT_DATA, "Path '{}' to file with enabled hive-style partitioning contains duplicated partition key {} with different values, only unique keys are allowed", path, ex.key); - } - - return key_values; -} -NamesAndTypesList extractHivePartitionColumnsFromPath( - const ColumnsDescription & storage_columns, - const std::string & sample_path, - const std::optional & format_settings, - const ContextPtr & context) -{ - NamesAndTypesList hive_partition_columns_to_read_from_file_path; - - const auto hive_map = parseHivePartitioningKeysAndValues(sample_path); - - for (const auto & item : hive_map) - { - const std::string key(item.first); - const std::string value(item.second); - - // if we know the type from the schema, use it. - if (storage_columns.has(key)) - { - hive_partition_columns_to_read_from_file_path.emplace_back(key, storage_columns.get(key).type); - } - else - { - if (const auto type = tryInferDataTypeByEscapingRule(value, format_settings ? *format_settings : getFormatSettings(context), FormatSettings::EscapingRule::Raw)) - { - hive_partition_columns_to_read_from_file_path.emplace_back(key, type); - } - else - { - hive_partition_columns_to_read_from_file_path.emplace_back(key, std::make_shared()); - } - } - } - - return hive_partition_columns_to_read_from_file_path; -} - -void addPartitionColumnsToChunk( - Chunk & chunk, - const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, - const std::string & path) -{ - const auto hive_map = parseHivePartitioningKeysAndValues(path); - - for (const auto & column : hive_partition_columns_to_read_from_file_path) - { - const std::string column_name = column.getNameInStorage(); - const auto it = hive_map.find(column_name); - - if (it == hive_map.end()) - { - throw Exception( - ErrorCodes::INCORRECT_DATA, - "Expected to find hive partitioning column {} in the path {}." 
- "Try it with hive partitioning disabled (partition_strategy='wildcard' and/or use_hive_partitioning=0", - column_name, - path); - } - - auto chunk_column = column.type->createColumnConst(chunk.getNumRows(), convertFieldToType(Field(it->second), *column.type))->convertToFullColumnIfConst(); - chunk.addColumn(std::move(chunk_column)); - } -} - -void extractPartitionColumnsFromPathAndEnrichStorageColumns( - ColumnsDescription & storage_columns, - NamesAndTypesList & hive_partition_columns_to_read_from_file_path, - const std::string & path, - bool inferred_schema, - std::optional format_settings, - ContextPtr context) -{ - hive_partition_columns_to_read_from_file_path = extractHivePartitionColumnsFromPath(storage_columns, path, format_settings, context); - - /// If the structure was inferred (not present in `columns_`), then we might need to enrich the schema with partition columns - /// Because they might not be present in the data and exist only in the path - if (inferred_schema) - { - for (const auto & [name, type]: hive_partition_columns_to_read_from_file_path) - { - if (!storage_columns.has(name)) - { - storage_columns.add({name, type}); - } - } - } - - if (hive_partition_columns_to_read_from_file_path.size() == storage_columns.size()) - { - throw Exception( - ErrorCodes::INCORRECT_DATA, - "A hive partitioned file can't contain only partition columns. Try reading it with `use_hive_partitioning=0`"); - } -} - -} - -} diff --git a/src/Storages/HivePartitioningUtils.h b/src/Storages/HivePartitioningUtils.h deleted file mode 100644 index b532aee67a13..000000000000 --- a/src/Storages/HivePartitioningUtils.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -class Chunk; - -namespace HivePartitioningUtils -{ -using HivePartitioningKeysAndValues = absl::flat_hash_map; - -HivePartitioningKeysAndValues parseHivePartitioningKeysAndValues(const std::string & path); - -void addPartitionColumnsToChunk( - Chunk & chunk, - const NamesAndTypesList & hive_partition_columns_to_read_from_file_path, - const std::string & path); - -void extractPartitionColumnsFromPathAndEnrichStorageColumns( - ColumnsDescription & storage_columns, - NamesAndTypesList & hive_partition_columns_to_read_from_file_path, - const std::string & path, - bool inferred_schema, - std::optional format_settings, - ContextPtr context); - -} - -} diff --git a/src/Storages/IPartitionStrategy.cpp b/src/Storages/IPartitionStrategy.cpp deleted file mode 100644 index 5ebdb249e661..000000000000 --- a/src/Storages/IPartitionStrategy.cpp +++ /dev/null @@ -1,377 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ -extern const int LOGICAL_ERROR; -extern const int BAD_ARGUMENTS; -} - -namespace -{ - /// Creates Expression actions to create hive path part of format - /// `partition_column_1=toString(partition_value_expr_1)/ ... /partition_column_N=toString(partition_value_expr_N)/` - /// for given partition columns list and a partition by AST. - /// The actions will be computed over chunk to convert partition values to string values. 
- HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName buildExpressionHive( - ASTPtr partition_by, - const NamesAndTypesList & partition_columns, - const Block & sample_block, - ContextPtr context) - { - HiveStylePartitionStrategy::PartitionExpressionActionsAndColumnName actions_with_column_name; - ASTs concat_args; - - if (const auto * tuple_function = partition_by->as(); - tuple_function && tuple_function->name == "tuple") - { - if (tuple_function->arguments->children.size() != partition_columns.size()) - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "The partition by expression has a different number of columns than what is expected by ClickHouse." - "This is a bug."); - } - - std::size_t index = 0; - - for (const auto & partition_column : partition_columns) - { - const auto & child = tuple_function->arguments->children[index++]; - - concat_args.push_back(std::make_shared(partition_column.name + "=")); - - concat_args.push_back(makeASTFunction("toString", child)); - - concat_args.push_back(std::make_shared("/")); - } - } - else - { - if (partition_columns.size() != 1) - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Expected partition expression to contain a single argument, got {} instead", partition_columns.size()); - } - - ASTs to_string_args = {1, partition_by}; - concat_args.push_back(std::make_shared(partition_columns.front().name + "=")); - concat_args.push_back(makeASTFunction("toString", std::move(to_string_args))); - concat_args.push_back(std::make_shared("/")); - } - - ASTPtr hive_expr = makeASTFunction("concat", std::move(concat_args)); - auto hive_syntax_result = TreeRewriter(context).analyze(hive_expr, sample_block.getNamesAndTypesList()); - actions_with_column_name.actions = ExpressionAnalyzer(hive_expr, hive_syntax_result, context).getActions(false); - actions_with_column_name.column_name = hive_expr->getColumnName(); - - return actions_with_column_name; - } - - Block buildBlockWithoutPartitionColumns( - const Block & sample_block, - const std::unordered_set & partition_expression_required_columns_set) - { - Block result; - for (size_t i = 0; i < sample_block.columns(); i++) - { - if (!partition_expression_required_columns_set.contains(sample_block.getByPosition(i).name)) - { - result.insert(sample_block.getByPosition(i)); - } - } - - return result; - } - - std::shared_ptr createHivePartitionStrategy( - ASTPtr partition_by, - const Block & sample_block, - ContextPtr context, - const std::string & file_format, - bool globbed_path, - bool partition_columns_in_data_file) - { - if (!partition_by) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy hive can not be used without a PARTITION BY expression"); - } - - if (globbed_path) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy {} can not be used with a globbed path", "hive"); - } - - if (file_format.empty() || file_format == "auto") - { - throw Exception(ErrorCodes::LOGICAL_ERROR, "File format can't be empty for hive style partitioning"); - } - - const auto partition_key_description = KeyDescription::getKeyFromAST(partition_by, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context); - - for (const auto & partition_expression_column : partition_key_description.sample_block) - { - if (!sample_block.has(partition_expression_column.name)) - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Hive partitioning expects that the partition by expression columns are a part of the storage columns, could not find '{}' in storage", - 
partition_expression_column.name); - } - - const auto & type = partition_expression_column.type; - const bool is_type_supported = isInteger(type) || isDateOrDate32OrTimeOrTime64OrDateTimeOrDateTime64(type) || isStringOrFixedString(type); - - if (!is_type_supported) - { - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "Hive partitioning supports only partition columns of types: Integer, Date, Time, DateTime and String/FixedString. Found '{}'", - type->getName()); - } - } - - return std::make_shared( - partition_key_description, - sample_block, - context, - file_format, - partition_columns_in_data_file); - } - - std::shared_ptr createWildcardPartitionStrategy( - ASTPtr partition_by, - const Block & sample_block, - ContextPtr context, - bool contains_partition_wildcard, - bool partition_columns_in_data_file) - { - if (!partition_by) - { - return nullptr; - } - - /// Backwards incompatible in the sense that `create table s3_table engine=s3('path_without_wildcard') PARTITION BY ... used to be valid, - /// but it is not anymore - if (!contains_partition_wildcard) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy wildcard can not be used without a '_partition_id' wildcard"); - } - - if (!partition_columns_in_data_file) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy {} can not be used with partition_columns_in_data_file=0", "wildcard"); - } - - return std::make_shared( - KeyDescription::getKeyFromAST(partition_by, ColumnsDescription::fromNamesAndTypes(sample_block.getNamesAndTypes()), context), - sample_block, - context); - } -} - -IPartitionStrategy::IPartitionStrategy(KeyDescription partition_key_description_, const Block & sample_block_, ContextPtr context_) -: partition_key_description(partition_key_description_), sample_block(sample_block_), context(context_) -{ -} - -NamesAndTypesList IPartitionStrategy::getPartitionColumns() const -{ - return partition_key_description.sample_block.getNamesAndTypesList(); -} - -const KeyDescription & IPartitionStrategy::getPartitionKeyDescription() const -{ - return partition_key_description; -} - -std::shared_ptr PartitionStrategyFactory::get(StrategyType strategy, - ASTPtr partition_by, - const NamesAndTypesList & partition_columns, - ContextPtr context, - const std::string & file_format, - bool globbed_path, - bool contains_partition_wildcard, - bool partition_columns_in_data_file) -{ - Block block; - for (const auto & partition_column : partition_columns) - { - block.insert({partition_column.type, partition_column.name}); - } - - switch (strategy) - { - case StrategyType::WILDCARD: - return createWildcardPartitionStrategy( - partition_by, - block, - context, - contains_partition_wildcard, - partition_columns_in_data_file); - case StrategyType::HIVE: - return createHivePartitionStrategy( - partition_by, - block, - context, - file_format, - globbed_path, - partition_columns_in_data_file); - case StrategyType::NONE: - /// Unreachable for plain object storage, used only by Data Lakes for now - return nullptr; - } -} - -WildcardPartitionStrategy::WildcardPartitionStrategy(KeyDescription partition_key_description_, const Block & sample_block_, ContextPtr context_) - : IPartitionStrategy(partition_key_description_, sample_block_, context_) -{ - ASTs arguments(1, partition_key_description_.definition_ast); - ASTPtr partition_by_string = makeASTFunction("toString", std::move(arguments)); - auto syntax_result = TreeRewriter(context).analyze(partition_by_string, sample_block.getNamesAndTypesList()); - 
actions_with_column_name.actions = ExpressionAnalyzer(partition_by_string, syntax_result, context).getActions(false); - actions_with_column_name.column_name = partition_by_string->getColumnName(); -} - -ColumnPtr WildcardPartitionStrategy::computePartitionKey(const Chunk & chunk) -{ - Block block_with_partition_by_expr = sample_block.cloneWithoutColumns(); - block_with_partition_by_expr.setColumns(chunk.getColumns()); - actions_with_column_name.actions->execute(block_with_partition_by_expr); - - return block_with_partition_by_expr.getByName(actions_with_column_name.column_name).column; -} - -std::string WildcardPartitionStrategy::getPathForRead( - const std::string & prefix) -{ - return prefix; -} - -std::string WildcardPartitionStrategy::getPathForWrite( - const std::string & prefix, - const std::string & partition_key) -{ - return PartitionedSink::replaceWildcards(prefix, partition_key); -} - -HiveStylePartitionStrategy::HiveStylePartitionStrategy( - KeyDescription partition_key_description_, - const Block & sample_block_, - ContextPtr context_, - const std::string & file_format_, - bool partition_columns_in_data_file_) - : IPartitionStrategy(partition_key_description_, sample_block_, context_), - file_format(file_format_), - partition_columns_in_data_file(partition_columns_in_data_file_) -{ - const auto partition_columns = getPartitionColumns(); - for (const auto & partition_column : partition_columns) - { - partition_columns_name_set.insert(partition_column.name); - } - actions_with_column_name = buildExpressionHive(partition_key_description.definition_ast, partition_columns, sample_block, context); - block_without_partition_columns = buildBlockWithoutPartitionColumns(sample_block, partition_columns_name_set); -} - -std::string HiveStylePartitionStrategy::getPathForRead(const std::string & prefix) -{ - return prefix + "**." + Poco::toLower(file_format); -} - -std::string HiveStylePartitionStrategy::getPathForWrite( - const std::string & prefix, - const std::string & partition_key) -{ - std::string path; - - if (!prefix.empty()) - { - path += prefix; - if (path.back() != '/') - { - path += '/'; - } - } - - /// Not adding '/' because buildExpressionHive() always adds a trailing '/' - path += partition_key; - - /* - * File extension is toLower(format) - * This isn't ideal, but I guess multiple formats can be specified and introduced. - * So I think it is simpler to keep it this way. - * - * Or perhaps implement something like `IInputFormat::getFileExtension()` - */ - path += std::to_string(generateSnowflakeID()) + "." + Poco::toLower(file_format); - - return path; -} - -ColumnPtr HiveStylePartitionStrategy::computePartitionKey(const Chunk & chunk) -{ - Block block_with_partition_by_expr = sample_block.cloneWithoutColumns(); - block_with_partition_by_expr.setColumns(chunk.getColumns()); - actions_with_column_name.actions->execute(block_with_partition_by_expr); - - return block_with_partition_by_expr.getByName(actions_with_column_name.column_name).column; -} - -ColumnRawPtrs HiveStylePartitionStrategy::getFormatChunkColumns(const Chunk & chunk) -{ - ColumnRawPtrs result; - if (partition_columns_in_data_file) - { - for (const auto & column : chunk.getColumns()) - { - result.emplace_back(column.get()); - } - - return result; - } - - if (chunk.getNumColumns() != sample_block.columns()) - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Incorrect number of columns in chunk. 
Expected {}, found {}", - sample_block.columns(), chunk.getNumColumns()); - } - - for (size_t i = 0; i < sample_block.columns(); i++) - { - if (!partition_columns_name_set.contains(sample_block.getByPosition(i).name)) - { - result.emplace_back(chunk.getColumns()[i].get()); - } - } - - return result; -} - -Block HiveStylePartitionStrategy::getFormatHeader() -{ - if (partition_columns_in_data_file) - { - return sample_block; - } - - return block_without_partition_columns; -} - -} diff --git a/src/Storages/IPartitionStrategy.h b/src/Storages/IPartitionStrategy.h deleted file mode 100644 index bc90d7f03461..000000000000 --- a/src/Storages/IPartitionStrategy.h +++ /dev/null @@ -1,128 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace DB -{ - -/* - * Class responsible for computing and generating a partition key for object storage. - * As of now, there are two possible implementations: hive and wildcard. - * - * It also offers some helper APIs like `getFormatChunk` and `getFormatHeader`. Required mostly because of `hive` strategy - * since the default behavior is not to write partition columns in the files and rely only on the file path. - */ -struct IPartitionStrategy -{ - struct PartitionExpressionActionsAndColumnName - { - ExpressionActionsPtr actions; - std::string column_name; - }; - - IPartitionStrategy(KeyDescription partition_key_description_, const Block & sample_block_, ContextPtr context_); - - virtual ~IPartitionStrategy() = default; - - virtual ColumnPtr computePartitionKey(const Chunk & chunk) = 0; - - virtual std::string getPathForRead(const std::string & prefix) = 0; - virtual std::string getPathForWrite(const std::string & prefix, const std::string & partition_key) = 0; - - virtual ColumnRawPtrs getFormatChunkColumns(const Chunk & chunk) - { - ColumnRawPtrs result_columns; - - for (const auto & column : chunk.getColumns()) - { - result_columns.emplace_back(column.get()); - } - - return result_columns; - } - - virtual Block getFormatHeader() { return sample_block; } - - NamesAndTypesList getPartitionColumns() const; - const KeyDescription & getPartitionKeyDescription() const; - -protected: - const KeyDescription partition_key_description; - const Block sample_block; - ContextPtr context; -}; - -/* - * Tries to create a partition strategy given a strategy name. - * Performs validation on required arguments by each strategy. Example: Partition strategy hive can not be used without a PARTITION BY expression - */ -struct PartitionStrategyFactory -{ - enum class StrategyType - { - NONE, /// The default for data lakes. To keep it backwards compatible, it is promoted to `WILDCARD` in case it is plain object storage - WILDCARD, - HIVE - }; - - static std::shared_ptr get( - StrategyType strategy, - ASTPtr partition_by, - const NamesAndTypesList & partition_columns, - ContextPtr context, - const std::string & file_format, - bool globbed_path, - bool contains_partition_wildcard, - bool partition_columns_in_data_file); -}; - -/* - * Simply wraps the partition expression with a `toString` function call. - * Path for reading is an identity function - * Path for writing replaces the `{_partition_id}` wildcard with the partition key. 
- */ -struct WildcardPartitionStrategy : IPartitionStrategy -{ - WildcardPartitionStrategy(KeyDescription partition_key_description_, const Block & sample_block_, ContextPtr context_); - - ColumnPtr computePartitionKey(const Chunk & chunk) override; - std::string getPathForRead(const std::string & prefix) override; - std::string getPathForWrite(const std::string & prefix, const std::string & partition_key) override; - -private: - PartitionExpressionActionsAndColumnName actions_with_column_name; -}; - -/* - * Builds partition keys in the hive format (e.g, key1=value1/key2=value2/) - * Path for reading appends recursive reading + file extension (e.g **.parquet) - * Path for writing appends partition key, snowflakeid as file name and file extension (e.g, table_root/key1=value1/key2=value2/1933642830979268608.parquet). - */ -struct HiveStylePartitionStrategy : IPartitionStrategy -{ - HiveStylePartitionStrategy( - KeyDescription partition_key_description_, - const Block & sample_block_, - ContextPtr context_, - const std::string & file_format_, - bool partition_columns_in_data_file_); - - ColumnPtr computePartitionKey(const Chunk & chunk) override; - std::string getPathForRead(const std::string & prefix) override; - std::string getPathForWrite(const std::string & prefix, const std::string & partition_key) override; - - ColumnRawPtrs getFormatChunkColumns(const Chunk & chunk) override; - Block getFormatHeader() override; - -private: - const std::string file_format; - const bool partition_columns_in_data_file; - std::unordered_set partition_columns_name_set; - PartitionExpressionActionsAndColumnName actions_with_column_name; - Block block_without_partition_columns; -}; - -} diff --git a/src/Storages/ObjectStorage/Azure/Configuration.cpp b/src/Storages/ObjectStorage/Azure/Configuration.cpp index 4aa9b177d8e2..eec50b5aadb4 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.cpp +++ b/src/Storages/ObjectStorage/Azure/Configuration.cpp @@ -56,7 +56,6 @@ const std::unordered_set optional_configuration_keys = { "account_key", "connection_string", "storage_account_url", - "partition_strategy" }; void StorageAzureConfiguration::check(ContextPtr context) const @@ -150,21 +149,6 @@ void StorageAzureConfiguration::fromNamedCollection(const NamedCollection & coll format = collection.getOrDefault("format", format); compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); - if (collection.has("partition_strategy")) - { - const auto partition_strategy_name = collection.get("partition_strategy"); - const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); - - if (!partition_strategy_type_opt) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy {} is not supported", partition_strategy_name); - } - - partition_strategy_type = partition_strategy_type_opt.value(); - } - - partition_columns_in_data_file = collection.getOrDefault("partition_columns_in_data_file", partition_strategy_type != PartitionStrategyFactory::StrategyType::HIVE); - blobs_paths = {blob_path}; connection_params = getConnectionParams(connection_url, container_name, account_name, account_key, context); } @@ -234,23 +218,14 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); if (is_format_arg(fourth_arg)) { - format = fourth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[4], 
"compression"); - - auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "partition_strategy/structure"); - if (magic_enum::enum_contains(sixth_arg, magic_enum::case_insensitive)) + if (with_structure) { - partition_strategy_type = magic_enum::enum_cast(sixth_arg, magic_enum::case_insensitive).value(); + format = fourth_arg; + compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); + structure = checkAndGetLiteralArgument(engine_args[5], "structure"); } else - { - if (with_structure) - { - structure = sixth_arg; - } - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", sixth_arg); - } + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); } else { @@ -271,147 +246,24 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, } } else if (engine_args.size() == 7) - { - const auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - - if (is_format_arg(fourth_arg)) - { - format = fourth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); - const auto partition_strategy_name = checkAndGetLiteralArgument(engine_args[5], "partition_strategy"); - const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); - - if (!partition_strategy_type_opt) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", partition_strategy_name); - } - - partition_strategy_type = partition_strategy_type_opt.value(); - - /// If it's of type String, then it is not `partition_columns_in_data_file` - if (const auto seventh_arg = tryGetLiteralArgument(engine_args[6], "structure/partition_columns_in_data_file")) - { - if (with_structure) - { - structure = seventh_arg.value(); - } - else - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected `partition_columns_in_data_file` of type boolean, but found: {}", seventh_arg.value()); - } - } - else - { - partition_columns_in_data_file = checkAndGetLiteralArgument(engine_args[6], "partition_columns_in_data_file"); - } - } - else - { - if (!with_structure && is_format_arg(fourth_arg)) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); - } - - account_name = fourth_arg; - account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); - if (!is_format_arg(sixth_arg)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - format = sixth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); - } - } - else if (engine_args.size() == 8) { auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); - - if (is_format_arg(fourth_arg)) - { - if (!with_structure) - { - /// If the fourth argument is a format, then it means a connection string is being used. 
- /// When using a connection string, the function only accepts 8 arguments in case `with_structure=true` - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid sequence / combination of arguments"); - } - format = fourth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[4], "compression"); - const auto partition_strategy_name = checkAndGetLiteralArgument(engine_args[5], "partition_strategy"); - const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); - - if (!partition_strategy_type_opt) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", partition_strategy_name); - } - - partition_strategy_type = partition_strategy_type_opt.value(); - partition_columns_in_data_file = checkAndGetLiteralArgument(engine_args[6], "partition_columns_in_data_file"); - structure = checkAndGetLiteralArgument(engine_args[7], "structure"); - } - else + if (!with_structure && is_format_arg(fourth_arg)) { - account_name = fourth_arg; - account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format"); - if (!is_format_arg(sixth_arg)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); - format = sixth_arg; - compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); - - auto eighth_arg = checkAndGetLiteralArgument(engine_args[7], "partition_strategy/structure"); - if (magic_enum::enum_contains(eighth_arg, magic_enum::case_insensitive)) - { - partition_strategy_type = magic_enum::enum_cast(eighth_arg, magic_enum::case_insensitive).value(); - } - else - { - if (with_structure) - { - structure = eighth_arg; - } - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", eighth_arg); - } + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Format and compression must be last arguments"); } - } - else if (engine_args.size() == 9) - { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + account_name = fourth_arg; account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); - auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format"); + auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format/account_name"); if (!is_format_arg(sixth_arg)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); format = sixth_arg; compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); - - const auto partition_strategy_name = checkAndGetLiteralArgument(engine_args[7], "partition_strategy"); - const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); - - if (!partition_strategy_type_opt) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", partition_strategy_name); - } - partition_strategy_type = partition_strategy_type_opt.value(); - /// If it's of type String, then it is not `partition_columns_in_data_file` - if (const auto nineth_arg = tryGetLiteralArgument(engine_args[8], "structure/partition_columns_in_data_file")) - { - if (with_structure) - { - structure = nineth_arg.value(); - } - else - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expected `partition_columns_in_data_file` of type boolean, but found: {}", nineth_arg.value()); - } - } - else - { - partition_columns_in_data_file = checkAndGetLiteralArgument(engine_args[8], "partition_columns_in_data_file"); - } } - else if 
(engine_args.size() == 10 && with_structure) + else if (with_structure && engine_args.size() == 8) { - auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "format/account_name"); + auto fourth_arg = checkAndGetLiteralArgument(engine_args[3], "account_name"); account_name = fourth_arg; account_key = checkAndGetLiteralArgument(engine_args[4], "account_key"); auto sixth_arg = checkAndGetLiteralArgument(engine_args[5], "format"); @@ -419,17 +271,7 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context, throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg); format = sixth_arg; compression_method = checkAndGetLiteralArgument(engine_args[6], "compression"); - - const auto partition_strategy_name = checkAndGetLiteralArgument(engine_args[7], "partition_strategy"); - const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); - - if (!partition_strategy_type_opt) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown partition strategy {}", partition_strategy_name); - } - partition_strategy_type = partition_strategy_type_opt.value(); - partition_columns_in_data_file = checkAndGetLiteralArgument(engine_args[8], "partition_columns_in_data_file"); - structure = checkAndGetLiteralArgument(engine_args[9], "structure"); + structure = checkAndGetLiteralArgument(engine_args[7], "structure"); } blobs_paths = {blob_path}; diff --git a/src/Storages/ObjectStorage/Azure/Configuration.h b/src/Storages/ObjectStorage/Azure/Configuration.h index 303232b72184..157da6047ce2 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.h +++ b/src/Storages/ObjectStorage/Azure/Configuration.h @@ -25,38 +25,28 @@ class StorageAzureConfiguration : public StorageObjectStorage::Configuration static constexpr auto type_name = "azure"; static constexpr auto engine_name = "Azure"; /// All possible signatures for Azure engine with structure argument (for example for azureBlobStorage table function). 
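 /// For illustration, a hypothetical call matching the retained
 /// `connection_string, container_name, blobpath, format, compression, structure` signature could look like:
 ///   azureBlobStorage('<connection_string>', 'my-container', 'data/*.csv', 'CSV', 'auto', 'id UInt32, name String')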
- static constexpr auto max_number_of_arguments_with_structure = 10; + static constexpr auto max_number_of_arguments_with_structure = 8; static constexpr auto signatures_with_structure = " - connection_string, container_name, blobpath\n" " - connection_string, container_name, blobpath, structure \n" " - connection_string, container_name, blobpath, format \n" " - connection_string, container_name, blobpath, format, compression \n" - " - connection_string, container_name, blobpath, format, compression, partition_strategy \n" " - connection_string, container_name, blobpath, format, compression, structure \n" - " - connection_string, container_name, blobpath, format, compression, partition_strategy, structure \n" - " - connection_string, container_name, blobpath, format, compression, partition_strategy, partition_columns_in_data_file \n" - " - connection_string, container_name, blobpath, format, compression, partition_strategy, partition_columns_in_data_file, structure \n" " - storage_account_url, container_name, blobpath, account_name, account_key\n" " - storage_account_url, container_name, blobpath, account_name, account_key, structure\n" " - storage_account_url, container_name, blobpath, account_name, account_key, format\n" " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, partition_strategy\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, partition_strategy, structure\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, partition_strategy, partition_columns_in_data_file\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, partition_strategy, partition_columns_in_data_file, structure\n"; + " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure\n"; /// All possible signatures for Azure engine without structure argument (for example for AzureBlobStorage table engine). - static constexpr auto max_number_of_arguments_without_structure = 9; + static constexpr auto max_number_of_arguments_without_structure = 7; static constexpr auto signatures_without_structure = " - connection_string, container_name, blobpath\n" " - connection_string, container_name, blobpath, format \n" " - connection_string, container_name, blobpath, format, compression \n" " - storage_account_url, container_name, blobpath, account_name, account_key\n" " - storage_account_url, container_name, blobpath, account_name, account_key, format\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, partition_strategy\n" - " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression, partition_strategy, partition_columns_in_data_file\n"; + " - storage_account_url, container_name, blobpath, account_name, account_key, format, compression\n"; StorageAzureConfiguration() = default; @@ -67,7 +57,8 @@ class StorageAzureConfiguration : public StorageObjectStorage::Configuration std::string getSignatures(bool with_structure = true) const { return with_structure ? 
signatures_with_structure : signatures_without_structure; } size_t getMaxNumberOfArguments(bool with_structure = true) const { return with_structure ? max_number_of_arguments_with_structure : max_number_of_arguments_without_structure; } - Path getRawPath() const override { return blob_path; } + Path getPath() const override { return blob_path; } + void setPath(const Path & path) override { blob_path = path; } const Paths & getPaths() const override { return blobs_paths; } void setPaths(const Paths & paths) override { blobs_paths = paths; } @@ -91,8 +82,8 @@ class StorageAzureConfiguration : public StorageObjectStorage::Configuration void fromNamedCollection(const NamedCollection & collection, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; - Path blob_path; - Paths blobs_paths; + std::string blob_path; + std::vector blobs_paths; AzureBlobStorage::ConnectionParams connection_params; }; diff --git a/src/Storages/ObjectStorage/DataLakes/Common.cpp b/src/Storages/ObjectStorage/DataLakes/Common.cpp index d2bc6ce6be59..d23a2826e40d 100644 --- a/src/Storages/ObjectStorage/DataLakes/Common.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Common.cpp @@ -13,7 +13,7 @@ std::vector listFiles( const StorageObjectStorage::Configuration & configuration, const String & prefix, const String & suffix) { - auto key = std::filesystem::path(configuration.getPathForRead().path) / prefix; + auto key = std::filesystem::path(configuration.getPath()) / prefix; RelativePathsWithMetadata files_with_metadata; object_storage.listObjects(key, files_with_metadata, 0); Strings res; diff --git a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h index 844f631f5d7d..8f4928c038f2 100644 --- a/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h +++ b/src/Storages/ObjectStorage/DataLakes/DataLakeConfiguration.h @@ -184,8 +184,7 @@ class DataLakeConfiguration : public BaseStorageConfiguration, public std::enabl const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, bool supports_subset_of_columns, - ContextPtr local_context, - const PrepareReadingFromFormatHiveParams &) override + ContextPtr local_context) override { if (!current_metadata) { diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLake/KernelHelper.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLake/KernelHelper.cpp index 48be506daa55..6da0c7747db2 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLake/KernelHelper.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLake/KernelHelper.cpp @@ -188,7 +188,7 @@ DeltaLake::KernelHelperPtr getKernelHelper( case DB::ObjectStorageType::Local: { const auto * local_conf = dynamic_cast(configuration.get()); - return std::make_shared(local_conf->getPathForRead().path); + return std::make_shared(local_conf->getPath()); } default: { diff --git a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp index e640e467c6d9..e6b876d4403d 100644 --- a/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/DeltaLakeMetadata.cpp @@ -176,7 +176,7 @@ struct DeltaLakeMetadataImpl while (true) { const auto filename = withPadding(++current_version) + metadata_file_suffix; - const auto file_path = std::filesystem::path(configuration_ptr->getPathForRead().path) / deltalake_metadata_directory / filename; + const auto file_path = 
std::filesystem::path(configuration_ptr->getPath()) / deltalake_metadata_directory / filename; if (!object_storage->exists(StoredObject(file_path))) break; @@ -301,7 +301,6 @@ struct DeltaLakeMetadataImpl } auto configuration_ptr = configuration.lock(); - const auto read_path_string = configuration_ptr->getPathForRead().path; if (object->has("add")) { @@ -310,7 +309,7 @@ struct DeltaLakeMetadataImpl throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to extract `add` field"); auto path = add_object->getValue("path"); - auto full_path = fs::path(read_path_string) / path; + auto full_path = fs::path(configuration_ptr->getPath()) / path; result.insert(full_path); auto filename = fs::path(path).filename().string(); @@ -355,7 +354,7 @@ struct DeltaLakeMetadataImpl throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to extract `remove` field"); auto path = remove_object->getValue("path"); - result.erase(fs::path(read_path_string) / path); + result.erase(fs::path(configuration_ptr->getPath()) / path); } } } @@ -404,7 +403,7 @@ struct DeltaLakeMetadataImpl { auto configuration_ptr = configuration.lock(); const auto last_checkpoint_file - = std::filesystem::path(configuration_ptr->getPathForRead().path) / deltalake_metadata_directory / "_last_checkpoint"; + = std::filesystem::path(configuration_ptr->getPath()) / deltalake_metadata_directory / "_last_checkpoint"; if (!object_storage->exists(StoredObject(last_checkpoint_file))) return 0; @@ -475,7 +474,7 @@ struct DeltaLakeMetadataImpl auto configuration_ptr = configuration.lock(); const auto checkpoint_path - = std::filesystem::path(configuration_ptr->getPathForRead().path) / deltalake_metadata_directory / checkpoint_filename; + = std::filesystem::path(configuration_ptr->getPath()) / deltalake_metadata_directory / checkpoint_filename; LOG_TRACE(log, "Using checkpoint file: {}", checkpoint_path.string()); @@ -562,7 +561,7 @@ struct DeltaLakeMetadataImpl continue; auto filename = fs::path(path).filename().string(); - auto full_path = fs::path(configuration_ptr->getPathForRead().path) / path; + auto full_path = fs::path(configuration_ptr->getPath()) / path; auto it = file_partition_columns.find(full_path); if (it == file_partition_columns.end()) { @@ -594,7 +593,7 @@ struct DeltaLakeMetadataImpl } LOG_TEST(log, "Adding {}", path); - const auto [_, inserted] = result.insert(std::filesystem::path(configuration_ptr->getPathForRead().path) / path); + const auto [_, inserted] = result.insert(std::filesystem::path(configuration_ptr->getPath()) / path); if (!inserted) throw Exception(ErrorCodes::INCORRECT_DATA, "File already exists {}", path); } diff --git a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp index c2b8caa64c55..dafc6f46370e 100644 --- a/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp +++ b/src/Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.cpp @@ -330,7 +330,7 @@ static std::pair getLatestMetadataFileAndVersion( if (metadata_files.empty()) { throw Exception( - ErrorCodes::FILE_DOESNT_EXIST, "The metadata file for Iceberg table with path {} doesn't exist", configuration_ptr->getPathForRead().path); + ErrorCodes::FILE_DOESNT_EXIST, "The metadata file for Iceberg table with path {} doesn't exist", configuration_ptr->getPath()); } std::vector metadata_files_with_versions; metadata_files_with_versions.reserve(metadata_files.size()); @@ -413,7 +413,7 @@ static std::pair getLatestOrExplicitMetadataFileAndVersion( if (*it == "." 
|| *it == "..") throw Exception(ErrorCodes::BAD_ARGUMENTS, "Relative paths are not allowed"); } - auto prefix_storage_path = configuration_ptr->getPathForRead().path; + auto prefix_storage_path = configuration_ptr->getPath(); if (!explicit_metadata_path.starts_with(prefix_storage_path)) explicit_metadata_path = std::filesystem::path(prefix_storage_path) / explicit_metadata_path; return getMetadataFileAndVersion(explicit_metadata_path); @@ -430,7 +430,7 @@ static std::pair getLatestOrExplicitMetadataFileAndVersion( } else if (data_lake_settings[DataLakeStorageSetting::iceberg_use_version_hint].value) { - auto prefix_storage_path = configuration_ptr->getPathForRead().path; + auto prefix_storage_path = configuration_ptr->getPath(); auto version_hint_path = std::filesystem::path(prefix_storage_path) / "metadata" / "version-hint.text"; std::string metadata_file; StoredObject version_hint(version_hint_path); @@ -499,7 +499,7 @@ void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Objec throw Exception( ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "No snapshot set found in metadata for iceberg table `{}`, it is impossible to get manifest list by snapshot id `{}`", - configuration_ptr->getPathForRead().path, + configuration_ptr->getPath(), relevant_snapshot_id); auto snapshots = metadata_object->get(f_snapshots).extract(); for (size_t i = 0; i < snapshots->size(); ++i) @@ -512,7 +512,7 @@ void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Objec ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "No manifest list found for snapshot id `{}` for iceberg table `{}`", relevant_snapshot_id, - configuration_ptr->getPathForRead().path); + configuration_ptr->getPath()); std::optional total_rows; std::optional total_bytes; @@ -528,7 +528,7 @@ void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Objec relevant_snapshot = IcebergSnapshot{ getManifestList(local_context, getProperFilePathFromMetadataInfo( - snapshot->getValue(f_manifest_list), configuration_ptr->getPathForRead().path, table_location)), + snapshot->getValue(f_manifest_list), configuration_ptr->getPath(), table_location)), relevant_snapshot_id, total_rows, total_bytes}; if (!snapshot->has(f_schema_id)) @@ -536,7 +536,7 @@ void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Objec ErrorCodes::ICEBERG_SPECIFICATION_VIOLATION, "No schema id found for snapshot id `{}` for iceberg table `{}`", relevant_snapshot_id, - configuration_ptr->getPathForRead().path); + configuration_ptr->getPath()); relevant_snapshot_schema_id = snapshot->getValue(f_schema_id); addTableSchemaById(relevant_snapshot_schema_id, metadata_object); return; @@ -546,7 +546,7 @@ void IcebergMetadata::updateSnapshot(ContextPtr local_context, Poco::JSON::Objec ErrorCodes::BAD_ARGUMENTS, "No manifest list is found for snapshot id `{}` in metadata for iceberg table `{}`", relevant_snapshot_id, - configuration_ptr->getPathForRead().path); + configuration_ptr->getPath()); } void IcebergMetadata::updateState(const ContextPtr & local_context, Poco::JSON::Object::Ptr metadata_object, bool metadata_file_changed) @@ -561,14 +561,14 @@ void IcebergMetadata::updateState(const ContextPtr & local_context, Poco::JSON:: throw Exception( ErrorCodes::BAD_ARGUMENTS, "Time travel with timestamp and snapshot id for iceberg table by path {} cannot be changed simultaneously", - configuration_ptr->getPathForRead().path); + configuration_ptr->getPath()); } if (timestamp_changed) { Int64 closest_timestamp = 0; Int64 
query_timestamp = local_context->getSettingsRef()[Setting::iceberg_timestamp_ms]; if (!metadata_object->has(f_snapshot_log)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "No snapshot log found in metadata for iceberg table {} so it is impossible to get relevant snapshot id using timestamp", configuration_ptr->getPathForRead().path); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No snapshot log found in metadata for iceberg table {} so it is impossible to get relevant snapshot id using timestamp", configuration_ptr->getPath()); auto snapshots = metadata_object->get(f_snapshot_log).extract(); relevant_snapshot_id = -1; for (size_t i = 0; i < snapshots->size(); ++i) @@ -582,7 +582,7 @@ void IcebergMetadata::updateState(const ContextPtr & local_context, Poco::JSON:: } } if (relevant_snapshot_id < 0) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "No snapshot found in snapshot log before requested timestamp for iceberg table {}", configuration_ptr->getPathForRead().path); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No snapshot found in snapshot log before requested timestamp for iceberg table {}", configuration_ptr->getPath()); updateSnapshot(local_context, metadata_object); } else if (snapshot_id_changed) @@ -647,6 +647,7 @@ std::optional IcebergMetadata::getSchemaVersionByFileIfOutdated(String da return std::optional{schema_id}; } + DataLakeMetadataPtr IcebergMetadata::create( const ObjectStoragePtr & object_storage, const ConfigurationObserverPtr & configuration, @@ -710,7 +711,7 @@ ManifestFileCacheKeys IcebergMetadata::getManifestList(ContextPtr local_context, for (size_t i = 0; i < manifest_list_deserializer.rows(); ++i) { const std::string file_path = manifest_list_deserializer.getValueFromRowByName(i, f_manifest_path, TypeIndex::String).safeGet(); - const auto manifest_file_name = getProperFilePathFromMetadataInfo(file_path, configuration_ptr->getPathForRead().path, table_location); + const auto manifest_file_name = getProperFilePathFromMetadataInfo(file_path, configuration_ptr->getPath(), table_location); Int64 added_sequence_number = 0; if (format_version > 1) added_sequence_number = manifest_list_deserializer.getValueFromRowByName(i, f_sequence_number, TypeIndex::Int64).safeGet(); @@ -840,7 +841,7 @@ ManifestFilePtr IcebergMetadata::getManifestFile(ContextPtr local_context, const return std::make_shared( manifest_file_deserializer, format_version, - configuration_ptr->getPathForRead().path, + configuration_ptr->getPath(), schema_id, schema_object, schema_processor, diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index b2d9a1b7d99a..73fbafcb388b 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -44,7 +44,7 @@ namespace ErrorCodes void StorageHDFSConfiguration::check(ContextPtr context) const { context->getRemoteHostFilter().checkURL(Poco::URI(url)); - checkHDFSURL(fs::path(url) / path.path.substr(1)); + checkHDFSURL(fs::path(url) / path.substr(1)); Configuration::check(context); } @@ -59,6 +59,17 @@ ObjectStoragePtr StorageHDFSConfiguration::createObjectStorage( /// NOLINT url, std::move(hdfs_settings), context->getConfigRef(), /* lazy_initialize */true); } +std::string StorageHDFSConfiguration::getPathWithoutGlobs() const +{ + /// Unlike s3 and azure, which are object storages, + /// hdfs is a filesystem, so it cannot list files by partial prefix, + /// only by directory. 
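+ /// For illustration, given a hypothetical path "warehouse/year=2024/month=*/data_*.parquet",
+ /// the code below stops at the first glob character ('*'), trims back to the last '/' before it,
+ /// and returns "warehouse/year=2024", which HDFS can list as a directory.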
+ auto first_glob_pos = path.find_first_of("*?{"); + auto end_of_path_without_globs = path.substr(0, first_glob_pos).rfind('/'); + if (end_of_path_without_globs == std::string::npos || end_of_path_without_globs == 0) + return "/"; + return path.substr(0, end_of_path_without_globs); +} StorageObjectStorage::QuerySettings StorageHDFSConfiguration::getQuerySettings(const ContextPtr & context) const { const auto & settings = context->getSettingsRef(); @@ -143,13 +154,13 @@ void StorageHDFSConfiguration::setURL(const std::string & url_) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad HDFS URL: {}. It should have the following structure 'hdfs://:/path'", url_); path = url_.substr(pos + 1); - if (!path.path.starts_with('/')) - path = '/' + path.path; + if (!path.starts_with('/')) + path = '/' + path; url = url_.substr(0, pos); paths = {path}; - LOG_TRACE(getLogger("StorageHDFSConfiguration"), "Using URL: {}, path: {}", url, path.path); + LOG_TRACE(getLogger("StorageHDFSConfiguration"), "Using URL: {}, path: {}", url, path); } void StorageHDFSConfiguration::addStructureAndFormatToArgsIfNeeded( diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 81022c54aadd..698dd4a8e354 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -41,19 +41,12 @@ class StorageHDFSConfiguration : public StorageObjectStorage::Configuration std::string getSignatures(bool with_structure = true) const { return with_structure ? signatures_with_structure : signatures_without_structure; } size_t getMaxNumberOfArguments(bool with_structure = true) const { return with_structure ? max_number_of_arguments_with_structure : max_number_of_arguments_without_structure; } - bool supportsPartialPathPrefix() const override { return false; } - - /// Unlike s3 and azure, which are object storages, - /// hdfs is a filesystem, so it cannot list files by partial prefix, - /// only by directory. - /// Therefore in the below methods we use supports_partial_prefix=false. - Path getRawPath() const override { return path; } + Path getPath() const override { return path; } + void setPath(const Path & path_) override { path = path_; } const Paths & getPaths() const override { return paths; } - void setPaths(const Paths & paths_) override - { - paths = paths_; - } + void setPaths(const Paths & paths_) override { paths = paths_; } + std::string getPathWithoutGlobs() const override; String getNamespace() const override { return ""; } String getDataSourceDescription() const override { return url; } @@ -76,8 +69,8 @@ class StorageHDFSConfiguration : public StorageObjectStorage::Configuration void setURL(const std::string & url_); String url; - Path path; - Paths paths; + String path; + std::vector paths; }; } diff --git a/src/Storages/ObjectStorage/Local/Configuration.h b/src/Storages/ObjectStorage/Local/Configuration.h index 1654bb36d79f..6b8f8d6c092b 100644 --- a/src/Storages/ObjectStorage/Local/Configuration.h +++ b/src/Storages/ObjectStorage/Local/Configuration.h @@ -45,7 +45,8 @@ class StorageLocalConfiguration : public StorageObjectStorage::Configuration std::string getSignatures(bool with_structure = true) const { return with_structure ? signatures_with_structure : signatures_without_structure; } size_t getMaxNumberOfArguments(bool with_structure = true) const { return with_structure ? 
max_number_of_arguments_with_structure : max_number_of_arguments_without_structure; } - Path getRawPath() const override { return path; } + Path getPath() const override { return path; } + void setPath(const Path & path_) override { path = path_; } const Paths & getPaths() const override { return paths; } void setPaths(const Paths & paths_) override { paths = paths_; } diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index dc552fad8518..6a468b33a04c 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -21,7 +21,6 @@ #include #include #include -#include namespace DB { @@ -53,7 +52,6 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int LOGICAL_ERROR; - extern const int BAD_ARGUMENTS; } static const std::unordered_set required_configuration_keys = { @@ -77,9 +75,7 @@ static const std::unordered_set optional_configuration_keys = "max_single_part_upload_size", "max_connections", "expiration_window_seconds", - "no_sign_request", - "partition_strategy", - "partition_columns_in_data_file" + "no_sign_request" }; String StorageS3Configuration::getDataSourceDescription() const @@ -92,7 +88,7 @@ std::string StorageS3Configuration::getPathInArchive() const if (url.archive_pattern.has_value()) return url.archive_pattern.value(); - throw Exception(ErrorCodes::LOGICAL_ERROR, "Path {} is not an archive", getRawPath().path); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Path {} is not an archive", getPath()); } void StorageS3Configuration::check(ContextPtr context) const @@ -175,21 +171,6 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect auth_settings[S3AuthSetting::expiration_window_seconds] = collection.getOrDefault("expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS); auth_settings[S3AuthSetting::session_token] = collection.getOrDefault("session_token", ""); - if (collection.has("partition_strategy")) - { - const auto partition_strategy_name = collection.get("partition_strategy"); - const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); - - if (!partition_strategy_type_opt) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy {} is not supported", partition_strategy_name); - } - - partition_strategy_type = partition_strategy_type_opt.value(); - } - - partition_columns_in_data_file = collection.getOrDefault("partition_columns_in_data_file", partition_strategy_type != PartitionStrategyFactory::StrategyType::HIVE); - format = collection.getOrDefault("format", format); compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); structure = collection.getOrDefault("structure", "auto"); @@ -364,60 +345,10 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"compression_method", 5}}; } } - /// For 7 arguments we support: - /// if with_structure == 0: - /// - s3(source, access_key_id, secret_access_key, session_token, format, compression_method, partition_strategy) - /// if with_structure == 1: - /// - s3(source, access_key_id, secret_access_key, session_token, format, structure, partition_strategy) - /// - s3(source, access_key_id, secret_access_key, session_token, format, structure, compression_method) - else if (count == 7) - 
{ - if (with_structure) - { - auto sixth_arg = checkAndGetLiteralArgument(args[6], "compression_method/partition_strategy"); - if (magic_enum::enum_contains(sixth_arg)) - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}, {"partition_strategy", 6}}; - } - else - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}, {"compression_method", 6}}; - } - } - else - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"compression_method", 5}, {"partition_strategy", 6}}; - } - } - /// For 8 arguments we support: - /// if with_structure == 0: - /// - s3(source, access_key_id, secret_access_key, session_token, format, compression_method, partition_strategy, partition_columns_in_data_file) - /// if with_structure == 1: - /// - s3(source, access_key_id, secret_access_key, session_token, format, structure, partition_strategy, partition_columns_in_data_file) - /// - s3(source, access_key_id, secret_access_key, session_token, format, structure, compression_method, partition_strategy) - else if (count == 8) - { - if (with_structure) - { - auto sixth_arg = checkAndGetLiteralArgument(args[6], "compression_method/partition_strategy"); - if (magic_enum::enum_contains(sixth_arg)) - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}, {"partition_strategy", 6}, {"partition_columns_in_data_file", 7}}; - } - else - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}, {"compression_method", 6}, {"partition_strategy", 7}}; - } - } - else - { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"compression_method", 5}, {"partition_strategy", 6}, {"partition_columns_in_data_file", 7}}; - } - } - /// s3(source, access_key_id, secret_access_key, session_token, format, structure, compression_method, partition_strategy, partition_columns_in_data_file) - else if (with_structure && count == 9) + /// s3(source, access_key_id, secret_access_key, session_token, format, structure, compression_method) + else if (with_structure && count == 7) { - engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}, {"compression_method", 6}, {"partition_strategy", 7}, {"partition_columns_in_data_file", 8}}; + engine_args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"session_token", 3}, {"format", 4}, {"structure", 5}, {"compression_method", 6}}; } /// This argument is always the first @@ -438,24 +369,6 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ if (engine_args_to_idx.contains("compression_method")) compression_method = checkAndGetLiteralArgument(args[engine_args_to_idx["compression_method"]], "compression_method"); - if (engine_args_to_idx.contains("partition_strategy")) - { - const auto partition_strategy_name = checkAndGetLiteralArgument(args[engine_args_to_idx["partition_strategy"]], "partition_strategy"); - const auto partition_strategy_type_opt = magic_enum::enum_cast(partition_strategy_name, magic_enum::case_insensitive); - - if (!partition_strategy_type_opt) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Partition strategy {} is not supported", partition_strategy_name); - } - - 
partition_strategy_type = partition_strategy_type_opt.value(); - } - - if (engine_args_to_idx.contains("partition_columns_in_data_file")) - partition_columns_in_data_file = checkAndGetLiteralArgument(args[engine_args_to_idx["partition_columns_in_data_file"]], "partition_columns_in_data_file"); - else - partition_columns_in_data_file = partition_strategy_type != PartitionStrategyFactory::StrategyType::HIVE; - if (engine_args_to_idx.contains("access_key_id")) auth_settings[S3AuthSetting::access_key_id] = checkAndGetLiteralArgument(args[engine_args_to_idx["access_key_id"]], "access_key_id"); @@ -654,7 +567,7 @@ void StorageS3Configuration::addStructureAndFormatToArgsIfNeeded( } } /// s3(source, access_key_id, secret_access_key, session_token, format, structure, compression_method) - else + else if (count == 7) { if (checkAndGetLiteralArgument(args[4], "format") == "auto") args[4] = format_literal; diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index 2804fd2a1104..52fcc45390c0 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -19,7 +19,7 @@ class StorageS3Configuration : public StorageObjectStorage::Configuration static constexpr auto type_name = "s3"; static constexpr auto namespace_name = "bucket"; /// All possible signatures for S3 storage with structure argument (for example for s3 table function). - static constexpr auto max_number_of_arguments_with_structure = 9; + static constexpr auto max_number_of_arguments_with_structure = 7; static constexpr auto signatures_with_structure = " - url\n" " - url, NOSIGN\n" @@ -37,14 +37,10 @@ class StorageS3Configuration : public StorageObjectStorage::Configuration " - url, access_key_id, secret_access_key, session_token, format, structure\n" " - url, access_key_id, secret_access_key, format, structure, compression_method\n" " - url, access_key_id, secret_access_key, session_token, format, structure, compression_method\n" - " - url, access_key_id, secret_access_key, session_token, format, structure, partition_strategy\n" - " - url, access_key_id, secret_access_key, session_token, format, structure, compression_method, partition_strategy\n" - " - url, access_key_id, secret_access_key, session_token, format, structure, partition_strategy, partition_columnns_in_data_file\n" - " - url, access_key_id, secret_access_key, session_token, format, structure, compression_method, partition_strategy, partition_columnns_in_data_file\n" "All signatures supports optional headers (specified as `headers('name'='value', 'name2'='value2')`)"; /// All possible signatures for S3 storage without structure argument (for example for S3 table engine). 
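 /// For illustration, a hypothetical table definition matching the retained
 /// `url, access_key_id, secret_access_key, session_token, format` signature could look like:
 ///   ENGINE = S3('https://storage.example.com/bucket/data.csv', '<access_key_id>', '<secret_access_key>', '<session_token>', 'CSV')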
- static constexpr auto max_number_of_arguments_without_structure = 8; + static constexpr auto max_number_of_arguments_without_structure = 6; static constexpr auto signatures_without_structure = " - url\n" " - url, NOSIGN\n" @@ -58,8 +54,6 @@ class StorageS3Configuration : public StorageObjectStorage::Configuration " - url, access_key_id, secret_access_key, session_token, format\n" " - url, access_key_id, secret_access_key, format, compression_method\n" " - url, access_key_id, secret_access_key, session_token, format, compression_method\n" - " - url, access_key_id, secret_access_key, session_token, format, compression_method, partition_strategy\n" - " - url, access_key_id, secret_access_key, session_token, format, compression_method, partition_strategy, partition_columnns_in_data_file\n" "All signatures supports optional headers (specified as `headers('name'='value', 'name2'='value2')`)"; StorageS3Configuration() = default; @@ -75,7 +69,8 @@ class StorageS3Configuration : public StorageObjectStorage::Configuration S3::URI getURL() const { return url; } const S3::S3AuthSettings & getAuthSettings() const { return auth_settings; } - Path getRawPath() const override { return url.key; } + Path getPath() const override { return url.key; } + void setPath(const Path & path) override { url.key = path; } const Paths & getPaths() const override { return keys; } void setPaths(const Paths & paths) override { keys = paths; } @@ -105,7 +100,7 @@ class StorageS3Configuration : public StorageObjectStorage::Configuration void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; S3::URI url; - Paths keys; + std::vector keys; S3::S3AuthSettings auth_settings; S3::S3RequestSettings request_settings; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index f910691767aa..2f619c9dd728 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -46,7 +45,6 @@ namespace ErrorCodes extern const int DATABASE_ACCESS_DENIED; extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; - extern const int INCORRECT_DATA; } String StorageObjectStorage::getPathSample(ContextPtr context) @@ -69,15 +67,12 @@ String StorageObjectStorage::getPathSample(ContextPtr context) {}, // predicate {}, {}, // virtual_columns - {}, // hive_columns nullptr, // read_keys {} // file_progress_callback ); - const auto path = configuration->getRawPath(); - - if (!configuration->isArchive() && !path.hasGlobs() && !local_distributed_processing) - return path.path; + if (!configuration->isArchive() && !configuration->isPathWithGlobs() && !local_distributed_processing) + return configuration->getPath(); if (auto file = file_iterator->next(0)) return file->getPath(); @@ -89,7 +84,7 @@ StorageObjectStorage::StorageObjectStorage( ObjectStoragePtr object_storage_, ContextPtr context, const StorageID & table_id_, - const ColumnsDescription & columns_in_table_or_function_definition, + const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, std::optional format_settings_, @@ -102,14 +97,13 @@ StorageObjectStorage::StorageObjectStorage( , configuration(configuration_) , object_storage(object_storage_) , format_settings(format_settings_) + , partition_by(partition_by_) , distributed_processing(distributed_processing_) , log(getLogger(fmt::format("Storage{}({})", 
configuration->getEngineName(), table_id_.getFullTableName()))) { - configuration->initPartitionStrategy(partition_by_, columns_in_table_or_function_definition, context); - - const bool need_resolve_columns_or_format = columns_in_table_or_function_definition.empty() || (configuration->format == "auto"); + const bool need_resolve_columns_or_format = columns_.empty() || (configuration->format == "auto"); const bool need_resolve_sample_path = context->getSettingsRef()[Setting::use_hive_partitioning] - && !configuration->partition_strategy + && !configuration->withPartitionWildcard() && !configuration->isDataLakeConfiguration(); const bool do_lazy_init = lazy_init && !need_resolve_columns_or_format && !need_resolve_sample_path; @@ -145,8 +139,7 @@ StorageObjectStorage::StorageObjectStorage( update_configuration_on_read_write = !is_table_function || !updated_configuration; std::string sample_path; - - ColumnsDescription columns{columns_in_table_or_function_definition}; + ColumnsDescription columns{columns_}; if (need_resolve_columns_or_format) resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, format_settings, sample_path, context); else @@ -154,9 +147,14 @@ StorageObjectStorage::StorageObjectStorage( configuration->check(context); + StorageInMemoryMetadata metadata; + metadata.setColumns(columns); + metadata.setConstraints(constraints_); + metadata.setComment(comment); + /// FIXME: We need to call getPathSample() lazily on select /// in case it failed to be initialized in constructor. - if (updated_configuration && sample_path.empty() && need_resolve_sample_path && !configuration->partition_strategy) + if (updated_configuration && sample_path.empty() && need_resolve_sample_path) { try { @@ -172,77 +170,8 @@ StorageObjectStorage::StorageObjectStorage( } } - /* - * If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression. - * There is no need to read from the file's path. - * - * Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path. - * And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns - */ - if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE) - { - hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns(); - } - else if (context->getSettingsRef()[Setting::use_hive_partitioning]) - { - HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns( - columns, - hive_partition_columns_to_read_from_file_path, - sample_path, - columns_in_table_or_function_definition.empty(), - format_settings, - context); - } - - if (hive_partition_columns_to_read_from_file_path.size() == columns.size()) - { - throw Exception( - ErrorCodes::INCORRECT_DATA, - "A hive partitioned file can't contain only partition columns. 
Try reading it with `partition_strategy=wildcard` and `use_hive_partitioning=0`"); - } - - if (configuration->partition_columns_in_data_file) - { - file_columns = columns; - } - else - { - std::unordered_set hive_partition_columns_to_read_from_file_path_set; - - for (const auto & [name, type] : hive_partition_columns_to_read_from_file_path) - { - hive_partition_columns_to_read_from_file_path_set.insert(name); - } - - for (const auto & [name, type] : columns.getAllPhysical()) - { - if (!hive_partition_columns_to_read_from_file_path_set.contains(name)) - { - file_columns.add({name, type}); - } - } - } - - // Assert file contains at least one column. The assertion only takes place if we were able to deduce the schema. The storage might be empty. - if (!columns.empty() && file_columns.empty()) - { - throw Exception(ErrorCodes::INCORRECT_DATA, - "File without physical columns is not supported. Please try it with `use_hive_partitioning=0` and or `partition_strategy=wildcard`. File {}", - sample_path); - } - - StorageInMemoryMetadata metadata; - metadata.setColumns(columns); - metadata.setConstraints(constraints_); - metadata.setComment(comment); - - /// I am not sure this is actually required, but just in case - if (configuration->partition_strategy) - { - metadata.partition_key = configuration->partition_strategy->getPartitionKeyDescription(); - } - - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage( + metadata.columns, context, sample_path, format_settings, configuration->isDataLakeConfiguration())); setInMemoryMetadata(metadata); } @@ -451,7 +380,7 @@ class ReadFromObjectStorageStep : public SourceStepWithFilter auto context = getContext(); iterator_wrapper = StorageObjectStorageSource::createFileIterator( configuration, configuration->getQuerySettings(context), object_storage, distributed_processing, - context, predicate, filter_actions_dag, virtual_columns, info.hive_partition_columns_to_read_from_file_path, nullptr, context->getFileProgressCallback()); + context, predicate, filter_actions_dag, virtual_columns, nullptr, context->getFileProgressCallback()); } }; } @@ -461,10 +390,9 @@ ReadFromFormatInfo StorageObjectStorage::Configuration::prepareReadingFromFormat const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, bool supports_subset_of_columns, - ContextPtr local_context, - const PrepareReadingFromFormatHiveParams & hive_parameters) + ContextPtr local_context) { - return DB::prepareReadingFromFormat(requested_columns, storage_snapshot, local_context, supports_subset_of_columns, hive_parameters); + return DB::prepareReadingFromFormat(requested_columns, storage_snapshot, local_context, supports_subset_of_columns); } std::optional StorageObjectStorage::Configuration::tryGetTableStructureFromMetadata() const @@ -493,25 +421,18 @@ void StorageObjectStorage::read( /* check_consistent_with_previous_metadata */true); } - if (configuration->partition_strategy && configuration->partition_strategy_type != PartitionStrategyFactory::StrategyType::HIVE) + if (partition_by && configuration->withPartitionWildcard()) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Reading from a partitioned {} storage is not implemented yet", getName()); } - auto all_file_columns = file_columns.getAll(); - - auto read_from_format_info = configuration->prepareReadingFromFormat( - object_storage, - column_names, - storage_snapshot, - supportsSubsetOfColumns(local_context), - local_context, - 
PrepareReadingFromFormatHiveParams { all_file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap() }); + const auto read_from_format_info = configuration->prepareReadingFromFormat( + object_storage, column_names, storage_snapshot, supportsSubsetOfColumns(local_context), local_context); const bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) - && local_context->getSettingsRef()[Setting::optimize_count_from_files]; + && local_context->getSettingsRef()[Setting::optimize_count_from_files]; auto modified_format_settings{format_settings}; if (!modified_format_settings.has_value()) @@ -539,7 +460,7 @@ void StorageObjectStorage::read( } SinkToStoragePtr StorageObjectStorage::write( - const ASTPtr &, + const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context, bool /* async_insert */) @@ -556,39 +477,48 @@ SinkToStoragePtr StorageObjectStorage::write( const auto sample_block = metadata_snapshot->getSampleBlock(); const auto & settings = configuration->getQuerySettings(local_context); - const auto raw_path = configuration->getRawPath(); - if (configuration->isArchive()) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Path '{}' contains archive. Write into archive is not supported", - raw_path.path); + configuration->getPath()); } - if (raw_path.hasGlobsIgnorePartitionWildcard()) + if (configuration->withGlobsIgnorePartitionWildcard()) { throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, - "Non partitioned table with path '{}' that contains globs, the table is in readonly mode", - configuration->getRawPath().path); + "Path '{}' contains globs, so the table is in readonly mode", + configuration->getPath()); } if (!configuration->supportsWrites()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Writes are not supported for engine"); - if (configuration->partition_strategy) + if (configuration->withPartitionWildcard()) { - return std::make_shared(object_storage, configuration, format_settings, sample_block, local_context); + ASTPtr partition_by_ast = partition_by; + if (auto insert_query = std::dynamic_pointer_cast(query)) + { + if (insert_query->partition_by) + partition_by_ast = insert_query->partition_by; + } + + if (partition_by_ast) + { + return std::make_shared( + object_storage, configuration, format_settings, sample_block, local_context, partition_by_ast); + } } auto paths = configuration->getPaths(); - if (auto new_key = checkAndGetNewFileOnInsertIfNeeded(*object_storage, *configuration, settings, paths.front().path, paths.size())) + if (auto new_key = checkAndGetNewFileOnInsertIfNeeded(*object_storage, *configuration, settings, paths.front(), paths.size())) { - paths.push_back({*new_key}); + paths.push_back(*new_key); } configuration->setPaths(paths); return std::make_shared( - paths.back().path, + paths.back(), object_storage, configuration, format_settings, @@ -602,26 +532,24 @@ void StorageObjectStorage::truncate( ContextPtr /* context */, TableExclusiveLockHolder & /* table_holder */) { - const auto path = configuration->getRawPath(); - if (configuration->isArchive()) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Path '{}' contains archive. 
Table cannot be truncated", - path.path); + configuration->getPath()); } - if (path.hasGlobs()) + if (configuration->withGlobs()) { throw Exception( ErrorCodes::DATABASE_ACCESS_DENIED, "{} key '{}' contains globs, so the table is in readonly mode and cannot be truncated", - getName(), path.path); + getName(), configuration->getPath()); } StoredObjects objects; for (const auto & key : configuration->getPaths()) - objects.emplace_back(key.path); + objects.emplace_back(key); object_storage->removeObjectsIfExist(objects); } @@ -642,7 +570,6 @@ std::unique_ptr StorageObjectStorage::createReadBufferIterat {}/* predicate */, {}, {}/* virtual_columns */, - {}, /* hive_columns */ &read_keys); return std::make_unique( @@ -740,23 +667,6 @@ void StorageObjectStorage::Configuration::initialize( else configuration_to_initialize.fromAST(engine_args, local_context, with_table_structure); - if (configuration_to_initialize.isNamespaceWithGlobs()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Expression can not have wildcards inside {} name", configuration_to_initialize.getNamespaceType()); - - if (configuration_to_initialize.isDataLakeConfiguration()) - { - if (configuration_to_initialize.partition_strategy_type != PartitionStrategyFactory::StrategyType::NONE) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "The `partition_strategy` argument is incompatible with data lakes"); - } - } - else if (configuration_to_initialize.partition_strategy_type == PartitionStrategyFactory::StrategyType::NONE) - { - // Promote to wildcard in case it is not data lake to make it backwards compatible - configuration_to_initialize.partition_strategy_type = PartitionStrategyFactory::StrategyType::WILDCARD; - } - if (configuration_to_initialize.format == "auto") { if (configuration_to_initialize.isDataLakeConfiguration()) @@ -767,96 +677,48 @@ void StorageObjectStorage::Configuration::initialize( { configuration_to_initialize.format = FormatFactory::instance() - .tryGetFormatFromFileName(configuration_to_initialize.isArchive() ? configuration_to_initialize.getPathInArchive() : configuration_to_initialize.getRawPath().path) + .tryGetFormatFromFileName(configuration_to_initialize.isArchive() ? 
configuration_to_initialize.getPathInArchive() : configuration_to_initialize.getPath()) .value_or("auto"); } } else FormatFactory::instance().checkFormatName(configuration_to_initialize.format); - /// It might be changed on `StorageObjectStorage::Configuration::initPartitionStrategy` - configuration_to_initialize.read_path = configuration_to_initialize.getRawPath(); configuration_to_initialize.initialized = true; - -} - -void StorageObjectStorage::Configuration::initPartitionStrategy(ASTPtr partition_by, const ColumnsDescription & columns, ContextPtr context) -{ - partition_strategy = PartitionStrategyFactory::get( - partition_strategy_type, - partition_by, - columns.getOrdinary(), - context, - format, - getRawPath().hasGlobs(), - getRawPath().hasPartitionWildcard(), - partition_columns_in_data_file); - - if (partition_strategy) - { - read_path = partition_strategy->getPathForRead(getRawPath().path); - LOG_DEBUG(getLogger("StorageObjectStorageConfiguration"), "Initialized partition strategy {}", magic_enum::enum_name(partition_strategy_type)); - } } -const StorageObjectStorage::Configuration::Path & StorageObjectStorage::Configuration::getPathForRead() const -{ - return read_path; -} - -StorageObjectStorage::Configuration::Path StorageObjectStorage::Configuration::getPathForWrite(const std::string & partition_id) const +void StorageObjectStorage::Configuration::check(ContextPtr) const { - auto raw_path = getRawPath(); - - if (!partition_strategy) - { - return raw_path; - } - - return Path {partition_strategy->getPathForWrite(raw_path.path, partition_id)}; + FormatFactory::instance().checkFormatName(format); } - -bool StorageObjectStorage::Configuration::Path::hasPartitionWildcard() const +bool StorageObjectStorage::Configuration::withPartitionWildcard() const { static const String PARTITION_ID_WILDCARD = "{_partition_id}"; - return path.find(PARTITION_ID_WILDCARD) != String::npos; -} - -bool StorageObjectStorage::Configuration::Path::hasGlobsIgnorePartitionWildcard() const -{ - if (!hasPartitionWildcard()) - return hasGlobs(); - return PartitionedSink::replaceWildcards(path, "").find_first_of("*?{") != std::string::npos; + return getPath().find(PARTITION_ID_WILDCARD) != String::npos + || getNamespace().find(PARTITION_ID_WILDCARD) != String::npos; } -bool StorageObjectStorage::Configuration::Path::hasGlobs() const +bool StorageObjectStorage::Configuration::withGlobsIgnorePartitionWildcard() const { - return path.find_first_of("*?{") != std::string::npos; + if (!withPartitionWildcard()) + return withGlobs(); + return PartitionedSink::replaceWildcards(getPath(), "").find_first_of("*?{") != std::string::npos; } -std::string StorageObjectStorage::Configuration::Path::cutGlobs(bool supports_partial_prefix) const +bool StorageObjectStorage::Configuration::isPathWithGlobs() const { - if (supports_partial_prefix) - { - return path.substr(0, path.find_first_of("*?{")); - } - - auto first_glob_pos = path.find_first_of("*?{"); - auto end_of_path_without_globs = path.substr(0, first_glob_pos).rfind('/'); - if (end_of_path_without_globs == std::string::npos || end_of_path_without_globs == 0) - return "/"; - return path.substr(0, end_of_path_without_globs); + return getPath().find_first_of("*?{") != std::string::npos; } -void StorageObjectStorage::Configuration::check(ContextPtr) const +bool StorageObjectStorage::Configuration::isNamespaceWithGlobs() const { - FormatFactory::instance().checkFormatName(format); + return getNamespace().find_first_of("*?{") != std::string::npos; } -bool 
StorageObjectStorage::Configuration::isNamespaceWithGlobs() const +std::string StorageObjectStorage::Configuration::getPathWithoutGlobs() const { - return getNamespace().find_first_of("*?{") != std::string::npos; + return getPath().substr(0, getPath().find_first_of("*?{")); } bool StorageObjectStorage::Configuration::isPathInArchiveWithGlobs() const @@ -866,7 +728,7 @@ bool StorageObjectStorage::Configuration::isPathInArchiveWithGlobs() const std::string StorageObjectStorage::Configuration::getPathInArchive() const { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Path {} is not archive", getRawPath().path); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Path {} is not archive", getPath()); } void StorageObjectStorage::Configuration::assertInitialized() const diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 0c75431cd1c1..e7eb1ef3e60f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -16,16 +16,14 @@ #include -#include - namespace DB { + class ReadBufferIterator; class SchemaCache; class NamedCollection; struct StorageObjectStorageSettings; using StorageObjectStorageSettingsPtr = std::shared_ptr; -struct IPartitionStrategy; namespace ErrorCodes { @@ -70,7 +68,7 @@ class StorageObjectStorage : public IStorage ObjectStoragePtr object_storage_, ContextPtr context_, const StorageID & table_id_, - const ColumnsDescription & columns_in_table_or_function_definition, + const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, std::optional format_settings_, @@ -169,6 +167,8 @@ class StorageObjectStorage : public IStorage /// `object_storage` to allow direct access to data storage. const ObjectStoragePtr object_storage; const std::optional format_settings; + /// Partition by expression from CREATE query. + const ASTPtr partition_by; /// Whether this engine is a part of according Cluster engine implementation. /// (One of the reading replicas, not the initiator). const bool distributed_processing; @@ -176,9 +176,6 @@ class StorageObjectStorage : public IStorage /// (e.g. refresh configuration) on each read() method call. bool update_configuration_on_read_write = true; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; - ColumnsDescription file_columns; - LoggerPtr log; }; @@ -188,21 +185,7 @@ class StorageObjectStorage::Configuration Configuration() = default; virtual ~Configuration() = default; - struct Path - { - Path() = default; - /// A partial prefix is a prefix that does not represent an actual object (directory or file), usually strings that do not end with a slash character. - /// Example: `table_root/year=20`. AWS S3 supports partial prefixes, but HDFS does not. - Path(const std::string & path_) : path(path_) {} /// NOLINT(google-explicit-constructor) - - std::string path; - - bool hasPartitionWildcard() const; - bool hasGlobsIgnorePartitionWildcard() const; - bool hasGlobs() const; - std::string cutGlobs(bool supports_partial_prefix) const; - }; - + using Path = std::string; using Paths = std::vector; /// Initialize configuration from either AST or NamedCollection. @@ -221,25 +204,11 @@ class StorageObjectStorage::Configuration /// buckets in S3. If object storage doesn't have any namepaces return empty string. 
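[Editor's note] The `getPathWithoutGlobs()` restored in the .cpp hunk above and the removed `cutGlobs(supports_partial_prefix)` both reduce a globbed key to a prefix that can be listed. A minimal standalone sketch of the two behaviours (function names are illustrative, not the ClickHouse API; the second variant mimics the removed `supports_partial_prefix == false` branch):

```cpp
#include <iostream>
#include <string>

// First position of a glob metacharacter, as in the restored getPathWithoutGlobs().
static size_t firstGlobPos(const std::string & path)
{
    return path.find_first_of("*?{");
}

// Restored behaviour: cut right at the first glob character. Usable as a listing
// prefix on stores that accept partial prefixes (e.g. "table_root/year=20").
static std::string pathWithoutGlobs(const std::string & path)
{
    return path.substr(0, firstGlobPos(path));
}

// Removed cutGlobs(false) behaviour: back off to the last '/' before the first
// glob, so the prefix always names a whole "directory".
static std::string pathWithoutGlobsDirOnly(const std::string & path)
{
    const auto glob_pos = firstGlobPos(path);
    const auto slash_pos = path.substr(0, glob_pos).rfind('/');
    if (slash_pos == std::string::npos || slash_pos == 0)
        return "/";
    return path.substr(0, slash_pos);
}

int main()
{
    const std::string key = "table_root/year=20*/part-?.parquet";
    std::cout << pathWithoutGlobs(key) << '\n';        // table_root/year=20
    std::cout << pathWithoutGlobsDirOnly(key) << '\n'; // table_root
}
```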
virtual std::string getNamespaceType() const { return "namespace"; } - // Path provided by the user in the query - virtual Path getRawPath() const = 0; - - const Path & getPathForRead() const; - // Path used for writing, it should not be globbed and might contain a partition key - Path getPathForWrite(const std::string & partition_id = "") const; + virtual Path getFullPath() const { return ""; } + virtual Path getPath() const = 0; + virtual void setPath(const Path & path) = 0; - void setPathForRead(const Path & path) - { - read_path = path; - } - - /* - * When using `s3_create_new_file_on_insert`, each new file path generated will be appended to the path list. - * This list is used to determine the next file name and the set of files that shall be read from remote storage. - * This is not ideal, there are much better ways to implement reads and writes. It should be eventually removed - */ virtual const Paths & getPaths() const = 0; - virtual void setPaths(const Paths & paths) = 0; virtual String getDataSourceDescription() const = 0; @@ -251,7 +220,12 @@ class StorageObjectStorage::Configuration virtual void addStructureAndFormatToArgsIfNeeded( ASTs & args, const String & structure_, const String & format_, ContextPtr context, bool with_structure) = 0; + bool withPartitionWildcard() const; + bool withGlobs() const { return isPathWithGlobs() || isNamespaceWithGlobs(); } + bool withGlobsIgnorePartitionWildcard() const; + bool isPathWithGlobs() const; bool isNamespaceWithGlobs() const; + virtual std::string getPathWithoutGlobs() const; virtual bool isArchive() const { return false; } bool isPathInArchiveWithGlobs() const; @@ -283,18 +257,13 @@ class StorageObjectStorage::Configuration const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, bool supports_subset_of_columns, - ContextPtr local_context, - const PrepareReadingFromFormatHiveParams & hive_parameters); - - void initPartitionStrategy(ASTPtr partition_by, const ColumnsDescription & columns, ContextPtr context); + ContextPtr local_context); virtual std::optional tryGetTableStructureFromMetadata() const; virtual bool supportsFileIterator() const { return false; } virtual bool supportsWrites() const { return true; } - virtual bool supportsPartialPathPrefix() const { return true; } - virtual ObjectIterator iterate( const ActionsDAG * /* filter_dag */, std::function /* callback */, @@ -319,11 +288,6 @@ class StorageObjectStorage::Configuration String format = "auto"; String compression_method = "auto"; String structure = "auto"; - PartitionStrategyFactory::StrategyType partition_strategy_type = PartitionStrategyFactory::StrategyType::NONE; - /// Whether partition column values are contained in the actual data. - /// And alternative is with hive partitioning, when they are contained in file path. 
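[Editor's note] The restored `withPartitionWildcard()` and the `PartitionedSink::replaceWildcards(...)` call earlier in this file both revolve around the literal `{_partition_id}` token in the key or bucket name. A small self-contained sketch of the detection and substitution; the exact replacement semantics of the real helper may differ slightly, this is only the idea:

```cpp
#include <iostream>
#include <string>

static const std::string PARTITION_ID_WILDCARD = "{_partition_id}";

// Mirrors withPartitionWildcard(): the wildcard may sit in the key or the bucket.
static bool hasPartitionWildcard(const std::string & key, const std::string & bucket)
{
    return key.find(PARTITION_ID_WILDCARD) != std::string::npos
        || bucket.find(PARTITION_ID_WILDCARD) != std::string::npos;
}

// Sketch of replaceWildcards(): substitute every occurrence of the token.
static std::string replaceWildcards(std::string haystack, const std::string & partition_id)
{
    size_t pos = 0;
    while ((pos = haystack.find(PARTITION_ID_WILDCARD, pos)) != std::string::npos)
    {
        haystack.replace(pos, PARTITION_ID_WILDCARD.size(), partition_id);
        pos += partition_id.size();
    }
    return haystack;
}

int main()
{
    const std::string key = "data/{_partition_id}/file.csv";
    std::cout << std::boolalpha << hasPartitionWildcard(key, "my-bucket") << '\n'; // true
    std::cout << replaceWildcards(key, "2024-01") << '\n'; // data/2024-01/file.csv
}
```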
- bool partition_columns_in_data_file = true; - std::shared_ptr partition_strategy; protected: virtual void fromNamedCollection(const NamedCollection & collection, ContextPtr context) = 0; @@ -332,12 +296,6 @@ class StorageObjectStorage::Configuration void assertInitialized() const; bool initialized = false; - -private: - // Path used for reading, by default it is the same as `getRawPath` - // When using `partition_strategy=hive`, a recursive reading pattern will be appended `'table_root/**.parquet' - Path read_path; - }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 424b0d5bfb52..511b9411b2ac 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -9,10 +9,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -29,10 +27,9 @@ namespace Setting namespace ErrorCodes { extern const int LOGICAL_ERROR; - extern const int INCORRECT_DATA; } -String StorageObjectStorageCluster::getPathSample(ContextPtr context) +String StorageObjectStorageCluster::getPathSample(StorageInMemoryMetadata metadata, ContextPtr context) { auto query_settings = configuration->getQuerySettings(context); /// We don't want to throw an exception if there are no files with specified path. @@ -45,8 +42,7 @@ String StorageObjectStorageCluster::getPathSample(ContextPtr context) context, {}, // predicate {}, - {}, // virtual_columns - {}, // hive_columns + metadata.getColumns().getAll(), // virtual_columns nullptr, // read_keys {} // file_progress_callback ); @@ -61,16 +57,14 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( ConfigurationPtr configuration_, ObjectStoragePtr object_storage_, const StorageID & table_id_, - const ColumnsDescription & columns_in_table_or_function_definition, + const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const ASTPtr & partition_by, ContextPtr context_) : IStorageCluster( cluster_name_, table_id_, getLogger(fmt::format("{}({})", configuration_->getEngineName(), table_id_.table_name))) , configuration{configuration_} , object_storage(object_storage_) { - configuration->initPartitionStrategy(partition_by, columns_in_table_or_function_definition, context_); /// We allow exceptions to be thrown on update(), /// because Cluster engine can only be used as table function, /// so no lazy initialization is allowed. @@ -80,51 +74,20 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( /* if_not_updated_before */false, /* check_consistent_with_previous_metadata */true); - ColumnsDescription columns{columns_in_table_or_function_definition}; + ColumnsDescription columns{columns_}; std::string sample_path; resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, {}, sample_path, context_); configuration->check(context_); - if (sample_path.empty() && context_->getSettingsRef()[Setting::use_hive_partitioning] && !configuration->isDataLakeConfiguration() && !configuration->partition_strategy) - sample_path = getPathSample(context_); - - /* - * If `partition_strategy=hive`, the partition columns shall be extracted from the `PARTITION BY` expression. - * There is no need to read from the filepath. - * - * Otherwise, in case `use_hive_partitioning=1`, we can keep the old behavior of extracting it from the sample path. 
- * And if the schema was inferred (not specified in the table definition), we need to enrich it with the path partition columns - */ - if (configuration->partition_strategy && configuration->partition_strategy_type == PartitionStrategyFactory::StrategyType::HIVE) - { - hive_partition_columns_to_read_from_file_path = configuration->partition_strategy->getPartitionColumns(); - } - else if (context_->getSettingsRef()[Setting::use_hive_partitioning]) - { - HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns( - columns, - hive_partition_columns_to_read_from_file_path, - sample_path, - columns_in_table_or_function_definition.empty(), - std::nullopt, - context_ - ); - } - - if (hive_partition_columns_to_read_from_file_path.size() == columns.size()) - { - throw Exception( - ErrorCodes::INCORRECT_DATA, - "A hive partitioned file can't contain only partition columns. Try reading it with `partition_strategy=wildcard` and `use_hive_partitioning=0`"); - } - - /// Hive: Not building the file_columns like `StorageObjectStorage` does because it is not necessary to do it here. - StorageInMemoryMetadata metadata; metadata.setColumns(columns); metadata.setConstraints(constraints_); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.columns)); + if (sample_path.empty() && context_->getSettingsRef()[Setting::use_hive_partitioning] && !configuration->isDataLakeConfiguration()) + sample_path = getPathSample(metadata, context_); + + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage( + metadata.columns, context_, sample_path, std::nullopt, configuration->isDataLakeConfiguration())); setInMemoryMetadata(metadata); } @@ -218,7 +181,7 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten { auto iterator = StorageObjectStorageSource::createFileIterator( configuration, configuration->getQuerySettings(local_context), object_storage, /* distributed_processing */false, - local_context, predicate, {}, virtual_columns, hive_partition_columns_to_read_from_file_path, nullptr, local_context->getFileProgressCallback(), /*ignore_archive_globs=*/true, /*skip_object_metadata=*/true); + local_context, predicate, {}, virtual_columns, nullptr, local_context->getFileProgressCallback(), /*ignore_archive_globs=*/true, /*skip_object_metadata=*/true); auto task_distributor = std::make_shared(iterator, number_of_replicas); diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index 1a557143076a..4f4d541008f1 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -17,9 +17,8 @@ class StorageObjectStorageCluster : public IStorageCluster ConfigurationPtr configuration_, ObjectStoragePtr object_storage_, const StorageID & table_id_, - const ColumnsDescription & columns_in_table_or_function_definition, + const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const ASTPtr & partition_by, ContextPtr context_); std::string getName() const override; @@ -27,7 +26,7 @@ class StorageObjectStorageCluster : public IStorageCluster RemoteQueryExecutor::Extension getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ContextPtr & context, size_t number_of_replicas) const override; - String getPathSample(ContextPtr context); + String getPathSample(StorageInMemoryMetadata metadata, ContextPtr context); std::optional totalRows(ContextPtr query_context) const override; 
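[Editor's note] The removed comment and the removed `extractPartitionColumnsFromPathAndEnrichStorageColumns` call above describe the `use_hive_partitioning` fallback: when no `PARTITION BY` expression is available, partition columns are recovered from a sample path and appended to the inferred schema. A standalone sketch of that recovery step, assuming `key=value` path segments (helper names invented for the example; the real code also rejects duplicate keys and infers types):

```cpp
#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// "store/year=2024/country=US/data.parquet" -> {"country":"US","year":"2024"},
// ignoring the trailing file name and any segment without '='.
static std::map<std::string, std::string> partitionKeysFromPath(const std::string & path)
{
    std::map<std::string, std::string> result;
    const auto last_slash = path.find_last_of('/');
    if (last_slash == std::string::npos)
        return result; // just a file name, nothing to extract

    const std::string dir = path.substr(0, last_slash);
    size_t start = 0;
    while (start <= dir.size())
    {
        const auto end = dir.find('/', start);
        const std::string segment = dir.substr(start, end == std::string::npos ? std::string::npos : end - start);
        const auto eq = segment.find('=');
        if (eq != std::string::npos && eq > 0)
            result[segment.substr(0, eq)] = segment.substr(eq + 1);
        if (end == std::string::npos)
            break;
        start = end + 1;
    }
    return result;
}

// Enrich an inferred schema with path-derived columns that are not already present.
static void enrichWithPartitionColumns(std::vector<std::string> & columns, const std::map<std::string, std::string> & keys)
{
    for (const auto & [key, value] : keys)
        if (std::find(columns.begin(), columns.end(), key) == columns.end())
            columns.push_back(key);
}

int main()
{
    std::vector<std::string> columns{"id", "amount"};
    enrichWithPartitionColumns(columns, partitionKeysFromPath("store/year=2024/country=US/data.parquet"));
    for (const auto & c : columns)
        std::cout << c << ' ';
    std::cout << '\n'; // id amount country year
}
```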
std::optional totalBytes(ContextPtr query_context) const override; @@ -42,7 +41,6 @@ class StorageObjectStorageCluster : public IStorageCluster const StorageObjectStorage::ConfigurationPtr configuration; const ObjectStoragePtr object_storage; NamesAndTypesList virtual_columns; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; }; } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp index e23787b5aa6b..6bb10710c342 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.cpp @@ -21,34 +21,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -namespace -{ - void validateKey(const String & str) - { - /// See: - /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html - /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject - - if (str.empty() || str.size() > 1024) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size()); - - if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); - - PartitionedSink::validatePartitionKey(str, true); - } - - void validateNamespace(const String & str, StorageObjectStorage::ConfigurationPtr configuration) - { - configuration->validateNamespace(str); - - if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); - - PartitionedSink::validatePartitionKey(str, false); - } -} - StorageObjectStorageSink::StorageObjectStorageSink( const std::string & path_, ObjectStoragePtr object_storage, @@ -132,8 +104,9 @@ PartitionedStorageObjectStorageSink::PartitionedStorageObjectStorageSink( ConfigurationPtr configuration_, std::optional format_settings_, const Block & sample_block_, - ContextPtr context_) - : PartitionedSink(configuration_->partition_strategy, context_, sample_block_) + ContextPtr context_, + const ASTPtr & partition_by) + : PartitionedSink(partition_by, context_, sample_block_) , object_storage(object_storage_) , configuration(configuration_) , query_settings(configuration_->getQuerySettings(context_)) @@ -151,25 +124,51 @@ StorageObjectStorageSink::~StorageObjectStorageSink() SinkPtr PartitionedStorageObjectStorageSink::createSinkForPartition(const String & partition_id) { - auto file_path = configuration->getPathForWrite(partition_id).path; + auto partition_bucket = replaceWildcards(configuration->getNamespace(), partition_id); + validateNamespace(partition_bucket); - validateNamespace(configuration->getNamespace(), configuration); - validateKey(file_path); + auto partition_key = replaceWildcards(configuration->getPath(), partition_id); + validateKey(partition_key); if (auto new_key = checkAndGetNewFileOnInsertIfNeeded( - *object_storage, *configuration, query_settings, file_path, /* sequence_number */1)) + *object_storage, *configuration, query_settings, partition_key, /* sequence_number */1)) { - file_path = *new_key; + partition_key = *new_key; } return std::make_shared( - file_path, + partition_key, object_storage, configuration, format_settings, - partition_strategy->getFormatHeader(), + sample_block, context ); } +void PartitionedStorageObjectStorageSink::validateKey(const String & str) +{ + /// See: + /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html + /// - 
https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject + + if (str.empty() || str.size() > 1024) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size()); + + if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); + + validatePartitionKey(str, true); +} + +void PartitionedStorageObjectStorageSink::validateNamespace(const String & str) +{ + configuration->validateNamespace(str); + + if (!UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); + + validatePartitionKey(str, false); +} + } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSink.h b/src/Storages/ObjectStorage/StorageObjectStorageSink.h index ebfee5ab96e6..edfda17fd19b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSink.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSink.h @@ -48,11 +48,15 @@ class PartitionedStorageObjectStorageSink : public PartitionedSink ConfigurationPtr configuration_, std::optional format_settings_, const Block & sample_block_, - ContextPtr context_); + ContextPtr context_, + const ASTPtr & partition_by); SinkPtr createSinkForPartition(const String & partition_id) override; private: + void validateKey(const String & str); + void validateNamespace(const String & str); + ObjectStoragePtr object_storage; ConfigurationPtr configuration; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index b4ae724abd03..4e10bd12e40d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -22,13 +22,11 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include @@ -132,7 +130,6 @@ std::shared_ptr StorageObjectStorageSource::createFileIterator( const ActionsDAG::Node * predicate, const std::optional & filter_actions_dag, const NamesAndTypesList & virtual_columns, - const NamesAndTypesList & hive_columns, ObjectInfos * read_keys, std::function file_progress_callback, bool ignore_archive_globs, @@ -150,13 +147,17 @@ std::shared_ptr StorageObjectStorageSource::createFileIterator( return distributed_iterator; } + if (configuration->isNamespaceWithGlobs()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Expression can not have wildcards inside {} name", configuration->getNamespaceType()); + std::unique_ptr iterator; - const auto & reading_path = configuration->getPathForRead(); - if (reading_path.hasGlobs()) + if (configuration->isPathWithGlobs()) { - if (hasExactlyOneBracketsExpansion(reading_path.path)) + auto path = configuration->getPath(); + if (hasExactlyOneBracketsExpansion(path)) { - auto paths = expandSelectionGlob(reading_path.path); + auto paths = expandSelectionGlob(configuration->getPath()); iterator = std::make_unique( paths, object_storage, virtual_columns, is_archive ? 
nullptr : read_keys, query_settings.ignore_non_existent_file, skip_object_metadata, file_progress_callback); @@ -164,7 +165,7 @@ std::shared_ptr StorageObjectStorageSource::createFileIterator( else /// Iterate through disclosed globs and make a source for each file iterator = std::make_unique( - object_storage, configuration, predicate, virtual_columns, hive_columns, + object_storage, configuration, predicate, virtual_columns, local_context, is_archive ? nullptr : read_keys, query_settings.list_object_keys_size, query_settings.throw_on_zero_files_match, file_progress_callback); } @@ -180,36 +181,22 @@ std::shared_ptr StorageObjectStorageSource::createFileIterator( { Strings paths; - auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, hive_columns); + auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); if (filter_dag) { - const auto configuration_paths = configuration->getPaths(); - - std::vector keys; - keys.reserve(configuration_paths.size()); - - for (const auto & path: configuration_paths) - { - keys.emplace_back(path.path); - } - + auto keys = configuration->getPaths(); paths.reserve(keys.size()); for (const auto & key : keys) paths.push_back(fs::path(configuration->getNamespace()) / key); VirtualColumnUtils::buildSetsForDAG(*filter_dag, local_context); auto actions = std::make_shared(std::move(*filter_dag)); - VirtualColumnUtils::filterByPathOrFile(keys, paths, actions, virtual_columns, hive_columns, local_context); + VirtualColumnUtils::filterByPathOrFile(keys, paths, actions, virtual_columns, local_context); paths = keys; } else { - const auto configuration_paths = configuration->getPaths(); - paths.reserve(configuration_paths.size()); - for (const auto & path: configuration_paths) - { - paths.emplace_back(path.path); - } + paths = configuration->getPaths(); } iterator = std::make_unique( @@ -266,34 +253,21 @@ Chunk StorageObjectStorageSource::generate() const auto & filename = object_info->getFileName(); std::string full_path = object_info->getPath(); - const auto reading_path = configuration->getPathForRead().path; - - if (!full_path.starts_with(reading_path)) - full_path = fs::path(reading_path) / object_info->getPath(); + if (!full_path.starts_with(configuration->getPath())) + full_path = fs::path(configuration->getPath()) / object_info->getPath(); chassert(object_info->metadata); - const auto path = getUniqueStoragePathIdentifier(*configuration, *object_info, false); - VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, - {.path = path, + {.path = getUniqueStoragePathIdentifier(*configuration, *object_info, false), .size = object_info->isArchive() ? object_info->fileSizeInArchive() : object_info->metadata->size_bytes, .filename = &filename, .last_modified = object_info->metadata->last_modified, .etag = &(object_info->metadata->etag)}, read_context); - // The order is important, it must be added after virtual columns.. 
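[Editor's note] On both sides of the `createFileIterator` hunk above, a path with exactly one `{a,b,c}` brackets group is expanded into plain keys (`hasExactlyOneBracketsExpansion` / `expandSelectionGlob`) instead of going through the glob iterator and a bucket listing. A standalone sketch of that expansion for the single-group case, not the actual helper:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Expand exactly one "{a,b,c}" group: "dir/{x,y}.csv" -> {"dir/x.csv", "dir/y.csv"}.
// Paths without such a group are returned unchanged.
static std::vector<std::string> expandSingleSelectionGlob(const std::string & path)
{
    const auto open = path.find('{');
    const auto close = path.find('}', open == std::string::npos ? 0 : open);
    if (open == std::string::npos || close == std::string::npos)
        return {path};

    const std::string prefix = path.substr(0, open);
    const std::string suffix = path.substr(close + 1);
    const std::string body = path.substr(open + 1, close - open - 1);

    std::vector<std::string> result;
    size_t start = 0;
    while (true)
    {
        const auto comma = body.find(',', start);
        result.push_back(prefix + body.substr(start, comma == std::string::npos ? std::string::npos : comma - start) + suffix);
        if (comma == std::string::npos)
            break;
        start = comma + 1;
    }
    return result;
}

int main()
{
    for (const auto & key : expandSingleSelectionGlob("bucket/data/{2023,2024}/events.parquet"))
        std::cout << key << '\n';
    // bucket/data/2023/events.parquet
    // bucket/data/2024/events.parquet
}
```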
- if (!read_from_format_info.hive_partition_columns_to_read_from_file_path.empty()) - { - HivePartitioningUtils::addPartitionColumnsToChunk( - chunk, - read_from_format_info.hive_partition_columns_to_read_from_file_path, - path); - } - if (chunk_size && chunk.hasColumns()) { const auto * object_with_partition_columns_info = dynamic_cast(object_info.get()); @@ -718,7 +692,6 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( ConfigurationPtr configuration_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - const NamesAndTypesList & hive_columns_, ContextPtr context_, ObjectInfos * read_keys_, size_t list_object_keys_size, @@ -728,29 +701,31 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( , object_storage(object_storage_) , configuration(configuration_) , virtual_columns(virtual_columns_) - , hive_columns(hive_columns_) , throw_on_zero_files_match(throw_on_zero_files_match_) , log(getLogger("GlobIterator")) , read_keys(read_keys_) , local_context(context_) , file_progress_callback(file_progress_callback_) { - const auto & reading_path = configuration->getPathForRead(); - if (reading_path.hasGlobs()) + if (configuration->isNamespaceWithGlobs()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside namespace name"); + } + if (configuration->isPathWithGlobs()) { - const auto & key_with_globs = reading_path; - const auto key_prefix = reading_path.cutGlobs(configuration->supportsPartialPathPrefix()); + const auto key_with_globs = configuration_->getPath(); + const auto key_prefix = configuration->getPathWithoutGlobs(); object_storage_iterator = object_storage->iterate(key_prefix, list_object_keys_size); - matcher = std::make_unique(makeRegexpPatternFromGlobs(key_with_globs.path)); + matcher = std::make_unique(makeRegexpPatternFromGlobs(key_with_globs)); if (!matcher->ok()) { - throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, "Cannot compile regex from glob ({}): {}", key_with_globs.path, matcher->error()); + throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, "Cannot compile regex from glob ({}): {}", key_with_globs, matcher->error()); } - recursive = key_with_globs.path == "/**"; - if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, hive_columns)) + recursive = key_with_globs == "/**"; + if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns)) { VirtualColumnUtils::buildSetsForDAG(*filter_dag, getContext()); filter_expr = std::make_shared(std::move(*filter_dag)); @@ -761,7 +736,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( throw Exception( ErrorCodes::BAD_ARGUMENTS, "Using glob iterator with path without globs is not allowed (used path: {})", - reading_path.path); + configuration->getPath()); } } @@ -786,7 +761,7 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne { throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Can not match any files with path {}", - configuration->getPathForRead().path); + configuration->getPath()); } first_iteration = false; return object_info; @@ -826,7 +801,7 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne for (const auto & object_info : new_batch) paths.push_back(getUniqueStoragePathIdentifier(*configuration, *object_info, false)); - VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_expr, virtual_columns, hive_columns, local_context); + VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_expr, 
virtual_columns, local_context); LOG_TEST(log, "Filtered files: {} -> {}", paths.size(), new_batch.size()); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 6221c1508100..dc7f1a23ae08 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -59,7 +59,6 @@ class StorageObjectStorageSource : public SourceWithKeyCondition const ActionsDAG::Node * predicate, const std::optional & filter_actions_dag, const NamesAndTypesList & virtual_columns, - const NamesAndTypesList & hive_columns, ObjectInfos * read_keys, std::function file_progress_callback = {}, bool ignore_archive_globs = false, @@ -176,7 +175,6 @@ class StorageObjectStorageSource::GlobIterator : public IObjectIterator, WithCon ConfigurationPtr configuration_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - const NamesAndTypesList & hive_columns_, ContextPtr context_, ObjectInfos * read_keys_, size_t list_object_keys_size, @@ -197,7 +195,6 @@ class StorageObjectStorageSource::GlobIterator : public IObjectIterator, WithCon const ObjectStoragePtr object_storage; const ConfigurationPtr configuration; const NamesAndTypesList virtual_columns; - const NamesAndTypesList hive_columns; const bool throw_on_zero_files_match; const LoggerPtr log; diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp index 99f213cc51db..180573b2e119 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp @@ -103,18 +103,16 @@ ObjectStorageQueueSource::FileIterator::FileIterator( if (configuration->isNamespaceWithGlobs()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside namespace name"); - const auto & reading_path = configuration->getPathForRead(); - - if (!reading_path.hasGlobs()) + if (!configuration->isPathWithGlobs()) { throw Exception( ErrorCodes::BAD_ARGUMENTS, "Using glob iterator with path without globs is not allowed (used path: {})", - reading_path.path); + configuration->getPath()); } - const auto globbed_key = reading_path.path; - object_storage_iterator = object_storage->iterate(reading_path.cutGlobs(configuration->supportsPartialPathPrefix()), list_objects_batch_size_); + const auto globbed_key = configuration_->getPath(); + object_storage_iterator = object_storage->iterate(configuration->getPathWithoutGlobs(), list_objects_batch_size_); matcher = std::make_unique(makeRegexpPatternFromGlobs(globbed_key)); if (!matcher->ok()) @@ -200,10 +198,8 @@ ObjectStorageQueueSource::FileIterator::next() for (const auto & object_info : new_batch) paths.push_back(Source::getUniqueStoragePathIdentifier(*configuration, *object_info, false)); - /// Hive partition columns were not being used in ObjectStorageQueue before the refactoring from (virtual -> physical). 
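[Editor's note] The glob iterators above follow the same pattern: list objects under the glob-free prefix, then keep only the names accepted by a regex compiled from the glob (`makeRegexpPatternFromGlobs`). A rough standalone converter showing the idea with `std::regex`; it handles only `*` and `?` and escapes other metacharacters, whereas the real helper also understands `{...}` groups:

```cpp
#include <iostream>
#include <regex>
#include <string>

// Very small glob -> regex translation: '*' matches any run of characters,
// '?' matches a single character, everything else is taken literally.
static std::string globToRegex(const std::string & glob)
{
    std::string re;
    for (char c : glob)
    {
        if (c == '*')
            re += ".*";
        else if (c == '?')
            re += '.';
        else if (std::string("\\^$.|()[]{}+").find(c) != std::string::npos)
        {
            re += '\\';
            re += c;
        }
        else
            re += c;
    }
    return re;
}

int main()
{
    const std::regex matcher(globToRegex("logs/2024-??/part-*.parquet"));
    for (const std::string key : {"logs/2024-01/part-7.parquet", "logs/2024/part-7.parquet"})
        std::cout << key << " -> " << std::boolalpha << std::regex_match(key, matcher) << '\n';
    // first key matches, second does not
}
```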
- /// So we are keeping it the way it is for now VirtualColumnUtils::filterByPathOrFile( - new_batch, paths, filter_expr, virtual_columns, /* hive partition columns */{}, getContext()); + new_batch, paths, filter_expr, virtual_columns, getContext()); LOG_TEST(log, "Filtered files: {} -> {} by path or filename", paths.size(), new_batch.size()); } diff --git a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp index 2c78f7257aeb..bd618d4f61d8 100644 --- a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp +++ b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp @@ -210,16 +210,15 @@ StorageObjectStorageQueue::StorageObjectStorageQueue( , reschedule_processing_interval_ms((*queue_settings_)[ObjectStorageQueueSetting::polling_min_timeout_ms]) , log(getLogger(fmt::format("Storage{}Queue ({})", configuration->getEngineName(), table_id_.getFullTableName()))) { - const auto & read_path = configuration->getPathForRead(); - if (read_path.path.empty()) + if (configuration->getPath().empty()) { - configuration->setPathForRead({"/*"}); + configuration->setPath("/*"); } - else if (read_path.path.ends_with('/')) + else if (configuration->getPath().ends_with('/')) { - configuration->setPathForRead({read_path.path + '*'}); + configuration->setPath(configuration->getPath() + '*'); } - else if (!read_path.hasGlobs()) + else if (!configuration->isPathWithGlobs()) { throw Exception(ErrorCodes::BAD_QUERY_PARAMETER, "ObjectStorageQueue url must either end with '/' or contain globs"); } @@ -242,7 +241,7 @@ StorageObjectStorageQueue::StorageObjectStorageQueue( storage_metadata.setComment(comment); if (engine_args->settings) storage_metadata.settings_changes = engine_args->settings->ptr(); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context_)); setInMemoryMetadata(storage_metadata); LOG_INFO(log, "Using zookeeper path: {}", zk_path.string()); diff --git a/src/Storages/PartitionedSink.cpp b/src/Storages/PartitionedSink.cpp index ec43e4b4ca1f..9b92d79e9c28 100644 --- a/src/Storages/PartitionedSink.cpp +++ b/src/Storages/PartitionedSink.cpp @@ -3,9 +3,13 @@ #include "PartitionedSink.h" #include -#include #include +#include +#include +#include + +#include #include @@ -17,18 +21,22 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_PARSE_TEXT; - extern const int INCORRECT_DATA; } PartitionedSink::PartitionedSink( - std::shared_ptr partition_strategy_, + const ASTPtr & partition_by, ContextPtr context_, const Block & sample_block_) : SinkToStorage(sample_block_) - , partition_strategy(partition_strategy_) , context(context_) , sample_block(sample_block_) { + ASTs arguments(1, partition_by); + ASTPtr partition_by_string = makeASTFunction("toString", std::move(arguments)); + + auto syntax_result = TreeRewriter(context).analyze(partition_by_string, sample_block.getNamesAndTypesList()); + partition_by_expr = ExpressionAnalyzer(partition_by_string, syntax_result, context).getActions(false); + partition_by_column_name = partition_by_string->getColumnName(); } @@ -44,21 +52,17 @@ SinkPtr PartitionedSink::getSinkForPartitionKey(StringRef partition_key) return it->second; } -void PartitionedSink::consume(Chunk & source_chunk) +void PartitionedSink::consume(Chunk & chunk) { - const ColumnPtr partition_by_result_column = partition_strategy->computePartitionKey(source_chunk); + const auto & 
columns = chunk.getColumns(); - /// Not all columns are serialized using the format writer (e.g, hive partitioning stores partition columns in the file path) - const auto columns_to_consume = partition_strategy->getFormatChunkColumns(source_chunk); + Block block_with_partition_by_expr = sample_block.cloneWithoutColumns(); + block_with_partition_by_expr.setColumns(columns); + partition_by_expr->execute(block_with_partition_by_expr); - if (columns_to_consume.empty()) - { - throw Exception(ErrorCodes::INCORRECT_DATA, - "No column to write as all columns are specified as partition columns. " - "Consider setting `partition_columns_in_data_file=1`"); - } + const auto * partition_by_result_column = block_with_partition_by_expr.getByName(partition_by_column_name).column.get(); - size_t chunk_rows = source_chunk.getNumRows(); + size_t chunk_rows = chunk.getNumRows(); chunk_row_index_to_partition_index.resize(chunk_rows); partition_id_to_chunk_index.clear(); @@ -73,7 +77,7 @@ void PartitionedSink::consume(Chunk & source_chunk) chunk_row_index_to_partition_index[row] = it->getMapped(); } - size_t columns_size = columns_to_consume.size(); + size_t columns_size = columns.size(); size_t partitions_size = partition_id_to_chunk_index.size(); Chunks partition_index_to_chunk; @@ -81,7 +85,7 @@ void PartitionedSink::consume(Chunk & source_chunk) for (size_t column_index = 0; column_index < columns_size; ++column_index) { - MutableColumns partition_index_to_column_split = columns_to_consume[column_index]->scatter(partitions_size, chunk_row_index_to_partition_index); + MutableColumns partition_index_to_column_split = columns[column_index]->scatter(partitions_size, chunk_row_index_to_partition_index); /// Add chunks into partition_index_to_chunk with sizes of result columns if (column_index == 0) diff --git a/src/Storages/PartitionedSink.h b/src/Storages/PartitionedSink.h index 481230792db0..af10fc9dcd82 100644 --- a/src/Storages/PartitionedSink.h +++ b/src/Storages/PartitionedSink.h @@ -8,7 +8,6 @@ #include #include #include -#include namespace DB @@ -19,10 +18,7 @@ class PartitionedSink : public SinkToStorage public: static constexpr auto PARTITION_ID_WILDCARD = "{_partition_id}"; - PartitionedSink( - std::shared_ptr partition_strategy_, - ContextPtr context_, - const Block & sample_block_); + PartitionedSink(const ASTPtr & partition_by, ContextPtr context_, const Block & sample_block_); ~PartitionedSink() override; @@ -40,13 +36,13 @@ class PartitionedSink : public SinkToStorage static String replaceWildcards(const String & haystack, const String & partition_id); -protected: - std::shared_ptr partition_strategy; - private: ContextPtr context; Block sample_block; + ExpressionActionsPtr partition_by_expr; + String partition_by_column_name; + absl::flat_hash_map partition_id_to_sink; HashMapWithSavedHash partition_id_to_chunk_index; IColumn::Selector chunk_row_index_to_partition_index; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 5a3637ec33c2..e5d38f374c3d 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -6,10 +6,8 @@ #include #include #include -#include #include -#include #include #include #include @@ -73,8 +71,6 @@ #include -#include - namespace ProfileEvents { extern const Event CreatedReadBufferOrdinary; @@ -108,7 +104,6 @@ namespace Setting extern const SettingsBool use_cache_for_count_from_files; extern const SettingsInt64 zstd_window_log_max; extern const SettingsBool enable_parsing_to_custom_serialization; - extern const SettingsBool 
use_hive_partitioning; } namespace ErrorCodes @@ -1144,25 +1139,7 @@ void StorageFile::setStorageMetadata(CommonArguments args) storage_metadata.setConstraints(args.constraints); storage_metadata.setComment(args.comment); - const auto sample_path = paths.empty() ? "" : paths[0]; - - auto & storage_columns = storage_metadata.columns; - - if (args.getContext()->getSettingsRef()[Setting::use_hive_partitioning]) - { - HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns( - storage_columns, - hive_partition_columns_to_read_from_file_path, - sample_path, - args.columns.empty(), - format_settings, - args.getContext()); - } - - /// If the `partition_strategy` argument is ever implemented for File storage, this must be updated - file_columns = storage_columns.getAllPhysical(); - - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, args.getContext(), paths.empty() ? "" : paths[0], format_settings)); setInMemoryMetadata(storage_metadata); } @@ -1182,20 +1159,19 @@ StorageFileSource::FilesIterator::FilesIterator( std::optional archive_info_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - const NamesAndTypesList & hive_columns, const ContextPtr & context_, bool distributed_processing_) : WithContext(context_), files(files_), archive_info(std::move(archive_info_)), distributed_processing(distributed_processing_) { std::optional filter_dag; if (!distributed_processing && !archive_info && !files.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, hive_columns); + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); if (filter_dag) { VirtualColumnUtils::buildSetsForDAG(*filter_dag, context_); auto actions = std::make_shared(std::move(*filter_dag)); - VirtualColumnUtils::filterByPathOrFile(files, files, actions, virtual_columns, hive_columns, context_); + VirtualColumnUtils::filterByPathOrFile(files, files, actions, virtual_columns, context_); } } @@ -1238,7 +1214,6 @@ StorageFileSource::StorageFileSource( , requested_virtual_columns(info.requested_virtual_columns) , block_for_format(info.format_header) , serialization_hints(info.serialization_hints) - , hive_partition_columns_to_read_from_file_path(info.hive_partition_columns_to_read_from_file_path) , max_block_size(max_block_size_) , need_only_count(need_only_count_) { @@ -1521,15 +1496,6 @@ Chunk StorageFileSource::generate() .last_modified = current_file_last_modified }, getContext()); - // The order is important, it must be added after virtual columns.. 
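[Editor's note] The `PartitionedSink::consume` restored above evaluates `toString(partition_by)` over the chunk, maps every distinct key to a dense partition index, and scatters each column by that selector so every partition's sink receives only its own rows. A compact standalone sketch of the selector-and-scatter step, using plain `std::vector`s in place of `IColumn::scatter`:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct Row { std::string partition_key; int value; };

int main()
{
    const std::vector<Row> chunk{
        {"2024-01", 1}, {"2024-02", 2}, {"2024-01", 3}, {"2024-03", 4}, {"2024-02", 5}};

    // 1. Map each distinct partition key to a dense index and remember, per row,
    //    which partition it belongs to (the "selector").
    std::unordered_map<std::string, size_t> partition_index_by_key;
    std::vector<size_t> selector(chunk.size());
    for (size_t row = 0; row < chunk.size(); ++row)
    {
        auto it = partition_index_by_key.emplace(chunk[row].partition_key, partition_index_by_key.size()).first;
        selector[row] = it->second;
    }

    // 2. Scatter rows into one sub-chunk per partition.
    std::vector<std::vector<Row>> partitions(partition_index_by_key.size());
    for (size_t row = 0; row < chunk.size(); ++row)
        partitions[selector[row]].push_back(chunk[row]);

    // 3. Each sub-chunk would now go to the sink created for its partition key.
    for (const auto & [key, index] : partition_index_by_key)
    {
        std::cout << key << ":";
        for (const auto & row : partitions[index])
            std::cout << ' ' << row.value;
        std::cout << '\n';
    }
}
```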
- if (!hive_partition_columns_to_read_from_file_path.empty()) - { - HivePartitioningUtils::addPartitionColumnsToChunk( - chunk, - hive_partition_columns_to_read_from_file_path, - current_path); - } - return chunk; } @@ -1672,13 +1638,7 @@ void StorageFile::read( auto this_ptr = std::static_pointer_cast(shared_from_this()); - auto read_from_format_info = prepareReadingFromFormat( - column_names, - storage_snapshot, - context, - supportsSubsetOfColumns(context), - PrepareReadingFromFormatHiveParams {file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap()}); - + auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, context, supportsSubsetOfColumns(context)); bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) && context->getSettingsRef()[Setting::optimize_count_from_files]; @@ -1707,7 +1667,6 @@ void ReadFromFile::createIterator(const ActionsDAG::Node * predicate) storage->archive_info, predicate, storage->getVirtualsList(), - info.hive_partition_columns_to_read_from_file_path, context, storage->distributed_processing); } @@ -1946,7 +1905,7 @@ class PartitionedStorageFileSink : public PartitionedSink { public: PartitionedStorageFileSink( - std::shared_ptr partition_strategy_, + const ASTPtr & partition_by, const StorageMetadataPtr & metadata_snapshot_, const String & table_name_for_log_, std::unique_lock && lock_, @@ -1957,7 +1916,7 @@ class PartitionedStorageFileSink : public PartitionedSink const String format_name_, ContextPtr context_, int flags_) - : PartitionedSink(partition_strategy_, context_, metadata_snapshot_->getSampleBlock()) + : PartitionedSink(partition_by, context_, metadata_snapshot_->getSampleBlock()) , path(path_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) @@ -1973,19 +1932,19 @@ class PartitionedStorageFileSink : public PartitionedSink SinkPtr createSinkForPartition(const String & partition_id) override { - std::string filepath = partition_strategy->getPathForWrite(path, partition_id); + auto partition_path = PartitionedSink::replaceWildcards(path, partition_id); - fs::create_directories(fs::path(filepath).parent_path()); + fs::create_directories(fs::path(partition_path).parent_path()); - validatePartitionKey(filepath, true); - checkCreationIsAllowed(context, context->getUserFilesPath(), filepath, /*can_be_directory=*/ true); + PartitionedSink::validatePartitionKey(partition_path, true); + checkCreationIsAllowed(context, context->getUserFilesPath(), partition_path, /*can_be_directory=*/ true); return std::make_shared( metadata_snapshot, table_name_for_log, -1, /* use_table_fd */false, base_path, - filepath, + partition_path, compression_method, format_settings, format_name, @@ -2035,18 +1994,8 @@ SinkToStoragePtr StorageFile::write( if (path_for_partitioned_write.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty path for partitioned write"); - auto partition_strategy = PartitionStrategyFactory::get( - PartitionStrategyFactory::StrategyType::WILDCARD, - insert_query->partition_by, - metadata_snapshot->getColumns().getAll(), - context, - format_name, - is_path_with_globs, - has_wildcards, - /* partition_columns_in_data_file */true); - return std::make_shared( - partition_strategy, + insert_query->partition_by, metadata_snapshot, getStorageID().getNameForLogs(), std::unique_lock{rwlock, getLockTimeout(context)}, diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index 14a67dfa1634..b3646c138080 
100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -201,8 +201,6 @@ class StorageFile final : public IStorage FileRenamer file_renamer; bool was_renamed = false; bool distributed_processing = false; - NamesAndTypesList file_columns; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; }; class StorageFileSource : public SourceWithKeyCondition, WithContext @@ -216,7 +214,6 @@ class StorageFileSource : public SourceWithKeyCondition, WithContext std::optional archive_info_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - const NamesAndTypesList & hive_columns, const ContextPtr & context_, bool distributed_processing_ = false); @@ -304,7 +301,6 @@ class StorageFileSource : public SourceWithKeyCondition, WithContext NamesAndTypesList requested_virtual_columns; Block block_for_format; SerializationInfoByName serialization_hints; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; UInt64 max_block_size; diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index 1946fdc8c77b..ce5e3daf6828 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -13,8 +13,6 @@ #include #include -#include -#include namespace DB @@ -25,11 +23,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -namespace Setting -{ - extern const SettingsBool use_hive_partitioning; -} - StorageFileCluster::StorageFileCluster( const ContextPtr & context, const String & cluster_name_, @@ -65,24 +58,8 @@ StorageFileCluster::StorageFileCluster( storage_metadata.setColumns(columns_); } - auto & storage_columns = storage_metadata.columns; - - if (context->getSettingsRef()[Setting::use_hive_partitioning]) - { - const std::string sample_path = paths.empty() ? "" : paths.front(); - - HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns( - storage_columns, - hive_partition_columns_to_read_from_file_path, - sample_path, - columns_.empty(), - std::nullopt, - context - ); - } - storage_metadata.setConstraints(constraints_); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context, paths.empty() ? 
"" : paths[0])); setInMemoryMetadata(storage_metadata); } @@ -102,7 +79,7 @@ void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const Sto RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context, const size_t) const { - auto iterator = std::make_shared(paths, std::nullopt, predicate, getVirtualsList(), hive_partition_columns_to_read_from_file_path, context); + auto iterator = std::make_shared(paths, std::nullopt, predicate, getVirtualsList(), context); auto callback = std::make_shared([iter = std::move(iterator)](size_t) mutable -> String { return iter->next(); }); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; } diff --git a/src/Storages/StorageFileCluster.h b/src/Storages/StorageFileCluster.h index 2cbd82ba4000..a19790219af4 100644 --- a/src/Storages/StorageFileCluster.h +++ b/src/Storages/StorageFileCluster.h @@ -35,7 +35,6 @@ class StorageFileCluster : public IStorageCluster Strings paths; String filename; String format_name; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; }; } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 354fd8c3b2b5..ebbef0c8a3dc 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include @@ -27,7 +26,6 @@ #include #include -#include #include #include @@ -80,7 +78,6 @@ namespace Setting extern const SettingsUInt64 output_format_compression_zstd_window_log; extern const SettingsBool use_cache_for_count_from_files; extern const SettingsInt64 zstd_window_log_max; - extern const SettingsBool use_hive_partitioning; } namespace ErrorCodes @@ -187,23 +184,8 @@ IStorageURLBase::IStorageURLBase( storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); - auto & storage_columns = storage_metadata.columns; - - if (context_->getSettingsRef()[Setting::use_hive_partitioning]) - { - HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns( - storage_columns, - hive_partition_columns_to_read_from_file_path, - getSampleURI(uri, context_), - columns_.empty(), - format_settings, - context_); - } - - /// If the `partition_strategy` argument is ever implemented for URL storage, this must be updated - file_columns = storage_columns.getAllPhysical(); - - auto virtual_columns_desc = VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns); + auto virtual_columns_desc = VirtualColumnUtils::getVirtualsForFileLikeStorage( + storage_metadata.columns, context_, getSampleURI(uri, context_), format_settings); if (!storage_metadata.getColumns().has("_headers")) { virtual_columns_desc.addEphemeral( @@ -249,13 +231,13 @@ namespace class StorageURLSource::DisclosedGlobIterator::Impl { public: - Impl(const String & uri_, size_t max_addresses, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const NamesAndTypesList & hive_columns, const ContextPtr & context) + Impl(const String & uri_, size_t max_addresses, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { uris = parseRemoteDescription(uri_, 0, uri_.size(), ',', max_addresses); std::optional filter_dag; if (!uris.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, hive_columns); + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); if (filter_dag) { @@ 
-266,7 +248,7 @@ class StorageURLSource::DisclosedGlobIterator::Impl VirtualColumnUtils::buildSetsForDAG(*filter_dag, context); auto actions = std::make_shared(std::move(*filter_dag)); - VirtualColumnUtils::filterByPathOrFile(uris, paths, actions, virtual_columns, hive_columns, context); + VirtualColumnUtils::filterByPathOrFile(uris, paths, actions, virtual_columns, context); } } @@ -289,8 +271,8 @@ class StorageURLSource::DisclosedGlobIterator::Impl std::atomic_size_t index = 0; }; -StorageURLSource::DisclosedGlobIterator::DisclosedGlobIterator(const String & uri, size_t max_addresses, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const NamesAndTypesList & hive_columns, const ContextPtr & context) - : pimpl(std::make_shared(uri, max_addresses, predicate, virtual_columns, hive_columns, context)) {} +StorageURLSource::DisclosedGlobIterator::DisclosedGlobIterator(const String & uri, size_t max_addresses, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) + : pimpl(std::make_shared(uri, max_addresses, predicate, virtual_columns, context)) {} String StorageURLSource::DisclosedGlobIterator::next() { @@ -346,7 +328,6 @@ StorageURLSource::StorageURLSource( , format_settings(format_settings_) , headers(getHeaders(headers_)) , need_only_count(need_only_count_) - , hive_partition_columns_to_read_from_file_path(info.hive_partition_columns_to_read_from_file_path) { /// Lazy initialization. We should not perform requests in constructor, because we need to do it in query pipeline. initialize = [=, this]() @@ -484,17 +465,6 @@ Chunk StorageURLSource::generate() .size = current_file_size, }, getContext()); - - // The order is important, hive partition columns must be added after virtual columns - if (!hive_partition_columns_to_read_from_file_path.empty()) - { - const auto path = curr_uri.getPath(); - HivePartitioningUtils::addPartitionColumnsToChunk( - chunk, - hive_partition_columns_to_read_from_file_path, - path); - } - chassert(dynamic_cast(read_buf.get())); if (need_headers_virtual_column) { @@ -733,7 +703,7 @@ class PartitionedStorageURLSink : public PartitionedSink { public: PartitionedStorageURLSink( - std::shared_ptr partition_strategy_, + const ASTPtr & partition_by, const String & uri_, const String & format_, const std::optional & format_settings_, @@ -743,7 +713,7 @@ class PartitionedStorageURLSink : public PartitionedSink const CompressionMethod compression_method_, const HTTPHeaderEntries & headers_, const String & http_method_) - : PartitionedSink(partition_strategy_, context_, sample_block_) + : PartitionedSink(partition_by, context_, sample_block_) , uri(uri_) , format(format_) , format_settings(format_settings_) @@ -758,8 +728,7 @@ class PartitionedStorageURLSink : public PartitionedSink SinkPtr createSinkForPartition(const String & partition_id) override { - std::string partition_path = partition_strategy->getPathForWrite(uri, partition_id); - + auto partition_path = PartitionedSink::replaceWildcards(uri, partition_id); context->getRemoteHostFilter().checkURL(Poco::URI(partition_path)); return std::make_shared( partition_path, format, format_settings, sample_block, context, timeouts, compression_method, headers, http_method); @@ -1172,12 +1141,7 @@ void IStorageURLBase::read( size_t num_streams) { auto params = getReadURIParams(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size); - auto read_from_format_info = prepareReadingFromFormat( - column_names, - 
storage_snapshot, - local_context, - supportsSubsetOfColumns(local_context), - PrepareReadingFromFormatHiveParams {file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap()}); + auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, local_context, supportsSubsetOfColumns(local_context)); bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) && local_context->getSettingsRef()[Setting::optimize_count_from_files]; @@ -1245,7 +1209,7 @@ void ReadFromURL::createIterator(const ActionsDAG::Node * predicate) else if (is_url_with_globs) { /// Iterate through disclosed globs and make a source for each file - auto glob_iterator = std::make_shared(storage->uri, max_addresses, predicate, storage->getVirtualsList(), info.hive_partition_columns_to_read_from_file_path, context); + auto glob_iterator = std::make_shared(storage->uri, max_addresses, predicate, storage->getVirtualsList(), context); /// check if we filtered out all the paths if (glob_iterator->size() == 0) @@ -1347,12 +1311,7 @@ void StorageURLWithFailover::read( size_t num_streams) { auto params = getReadURIParams(column_names, storage_snapshot, query_info, local_context, processed_stage, max_block_size); - auto read_from_format_info = prepareReadingFromFormat( - column_names, - storage_snapshot, - local_context, - supportsSubsetOfColumns(local_context), - PrepareReadingFromFormatHiveParams {file_columns, hive_partition_columns_to_read_from_file_path.getNameToTypeMap()}); + auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, local_context, supportsSubsetOfColumns(local_context)); bool need_only_count = (query_info.optimize_trivial_count || read_from_format_info.requested_columns.empty()) && local_context->getSettingsRef()[Setting::optimize_count_from_files]; @@ -1398,18 +1357,8 @@ SinkToStoragePtr IStorageURLBase::write(const ASTPtr & query, const StorageMetad if (is_partitioned_implementation) { - auto partition_strategy = PartitionStrategyFactory::get( - PartitionStrategyFactory::StrategyType::WILDCARD, - partition_by_ast, - metadata_snapshot->getColumns().getAll(), - context, - format_name, - urlWithGlobs(uri), - has_wildcards, - /* partition_columns_in_data_file */true); - return std::make_shared( - partition_strategy, + partition_by_ast, uri, format_name, format_settings, diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index cc274594ce8c..7473b87ed71a 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -103,8 +103,6 @@ class IStorageURLBase : public IStorage String http_method; /// For insert can choose Put instead of default Post. 
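[Editor's note] `ReadFromURL::createIterator` above builds a `DisclosedGlobIterator`, which turns one URL description into a list of concrete URLs via `parseRemoteDescription` (comma-separated alternatives plus brace expansions). A sketch of just the `{N..M}` numeric-range expansion such a description needs, as an illustration only; the real parser handles more forms and nesting:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Expand one "{N..M}" range: "http://host{1..3}/data.csv"
// -> {"http://host1/data.csv", "http://host2/data.csv", "http://host3/data.csv"}.
static std::vector<std::string> expandNumericRange(const std::string & uri)
{
    const auto open = uri.find('{');
    const auto dots = uri.find("..", open == std::string::npos ? 0 : open);
    const auto close = uri.find('}', dots == std::string::npos ? 0 : dots);
    if (open == std::string::npos || dots == std::string::npos || close == std::string::npos)
        return {uri}; // no range, return the description unchanged

    const int first = std::stoi(uri.substr(open + 1, dots - open - 1));
    const int last = std::stoi(uri.substr(dots + 2, close - dots - 2));

    std::vector<std::string> result;
    for (int i = first; i <= last; ++i)
        result.push_back(uri.substr(0, open) + std::to_string(i) + uri.substr(close + 1));
    return result;
}

int main()
{
    for (const auto & url : expandNumericRange("http://host{1..3}/data.csv"))
        std::cout << url << '\n';
}
```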
ASTPtr partition_by; bool distributed_processing; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; - NamesAndTypesList file_columns; virtual std::string getReadMethod() const; @@ -156,7 +154,7 @@ class StorageURLSource : public SourceWithKeyCondition, WithContext class DisclosedGlobIterator { public: - DisclosedGlobIterator(const String & uri_, size_t max_addresses, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const NamesAndTypesList & hive_columns, const ContextPtr & context); + DisclosedGlobIterator(const String & uri_, size_t max_addresses, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context); String next(); size_t size(); @@ -234,7 +232,6 @@ class StorageURLSource : public SourceWithKeyCondition, WithContext HTTPHeaderEntries headers; bool need_only_count; size_t total_rows_in_file = 0; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; Poco::Net::HTTPBasicCredentials credentials; diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index fff85117b37a..f9dec87247a0 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include @@ -40,7 +39,6 @@ namespace ErrorCodes namespace Setting { extern const SettingsUInt64 glob_expansion_max_elements; - extern const SettingsBool use_hive_partitioning; } StorageURLCluster::StorageURLCluster( @@ -82,20 +80,7 @@ StorageURLCluster::StorageURLCluster( storage_metadata.setColumns(columns_); } - auto & storage_columns = storage_metadata.columns; - - if (context->getSettingsRef()[Setting::use_hive_partitioning]) - { - HivePartitioningUtils::extractPartitionColumnsFromPathAndEnrichStorageColumns( - storage_columns, - hive_partition_columns_to_read_from_file_path, - getSampleURI(uri, context), - columns_.empty(), - std::nullopt, - context); - } - - auto virtual_columns_desc = VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns); + auto virtual_columns_desc = VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.columns, context, getSampleURI(uri, context)); if (!storage_metadata.getColumns().has("_headers")) { virtual_columns_desc.addEphemeral( @@ -132,7 +117,7 @@ void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageS RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context, size_t) const { auto iterator = std::make_shared( - uri, context->getSettingsRef()[Setting::glob_expansion_max_elements], predicate, getVirtualsList(), hive_partition_columns_to_read_from_file_path, context); + uri, context->getSettingsRef()[Setting::glob_expansion_max_elements], predicate, getVirtualsList(), context); auto callback = std::make_shared([iter = std::move(iterator)](size_t) mutable -> String { return iter->next(); }); return RemoteQueryExecutor::Extension{.task_iterator = std::move(callback)}; } diff --git a/src/Storages/StorageURLCluster.h b/src/Storages/StorageURLCluster.h index 8349f7594294..98dd1d3ece12 100644 --- a/src/Storages/StorageURLCluster.h +++ b/src/Storages/StorageURLCluster.h @@ -37,7 +37,6 @@ class StorageURLCluster : public IStorageCluster String uri; String format_name; - NamesAndTypesList hive_partition_columns_to_read_from_file_path; }; diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index d1402e8f3d71..4e8575d4142f 100644 --- 
a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -51,15 +51,25 @@ #include #include #include +#include #include #include #include #include -#include +#include namespace DB { +namespace Setting +{ + extern const SettingsBool use_hive_partitioning; +} + +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +} namespace VirtualColumnUtils { @@ -138,16 +148,64 @@ NameSet getVirtualNamesForFileLikeStorage() return getCommonVirtualsForFileLikeStorage().getNameSet(); } -VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & storage_columns) +static auto makeExtractor() +{ + return KeyValuePairExtractorBuilder().withItemDelimiters({'/'}).withKeyValueDelimiter('=').buildWithReferenceMap(); +} + +HivePartitioningKeysAndValues parseHivePartitioningKeysAndValues(const String & path) +{ + static auto extractor = makeExtractor(); + + HivePartitioningKeysAndValues key_values; + + // cutting the filename to prevent malformed filenames that contain key-value-pairs from being extracted + // not sure if we actually need to do that, but just in case. Plus, the previous regex impl took care of it + const auto last_slash_pos = path.find_last_of('/'); + + if (last_slash_pos == std::string::npos) + { + // nothing to extract, there is no path, just a filename + return key_values; + } + + std::string_view path_without_filename(path.data(), last_slash_pos); + + try + { + extractor.extract(path_without_filename, key_values); + } + catch (const extractKV::DuplicateKeyFoundException & ex) + { + throw Exception(ErrorCodes::INCORRECT_DATA, "Path '{}' to file with enabled hive-style partitioning contains duplicated partition key {} with different values, only unique keys are allowed", path, ex.key); + } + + return key_values; +} + +VirtualColumnsDescription getVirtualsForFileLikeStorage( + ColumnsDescription & storage_columns, + const ContextPtr & context, + const std::string & path, + std::optional format_settings_, + bool is_data_lake) { VirtualColumnsDescription desc; - auto add_virtual = [&](const NameAndTypePair & pair) + auto add_virtual = [&](const NameAndTypePair & pair, bool prefer_virtual_column) /// By using prefer_virtual_column we define whether we will overwrite the storage column with the virtual one { const auto & name = pair.getNameInStorage(); const auto & type = pair.getTypeInStorage(); if (storage_columns.has(name)) { + if (!prefer_virtual_column) + return; + + if (storage_columns.size() == 1) + throw Exception(ErrorCodes::INCORRECT_DATA, "Cannot use hive partitioning for file {}: it contains only partition columns. Disable use_hive_partitioning setting to read/write this file", path); + auto local_type = storage_columns.get(name).type; + storage_columns.remove(name); + desc.addEphemeral(name, local_type, ""); return; } @@ -155,12 +213,33 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto }; for (const auto & item : getCommonVirtualsForFileLikeStorage()) - add_virtual(item); + add_virtual(item, false); + + if (context->getSettingsRef()[Setting::use_hive_partitioning] && !is_data_lake) + { + const auto map = parseHivePartitioningKeysAndValues(path); + auto format_settings = format_settings_ ? 
*format_settings_ : getFormatSettings(context); + + for (const auto & item : map) + { + const std::string key(item.first); + const std::string value(item.second); + + auto type = tryInferDataTypeByEscapingRule(value, format_settings, FormatSettings::EscapingRule::Raw); + + if (type == nullptr) + type = std::make_shared(); + if (type->canBeInsideLowCardinality()) + add_virtual({key, std::make_shared(type)}, true); + else + add_virtual({key, type}, true); + } + } return desc; } -static void addPathAndFileToVirtualColumns(Block & block, const String & path, size_t idx, const FormatSettings & format_settings, bool parse_hive_columns) +static void addPathAndFileToVirtualColumns(Block & block, const String & path, size_t idx, const FormatSettings & format_settings, bool use_hive_partitioning) { if (block.has("_path")) block.getByName("_path").column->assumeMutableRef().insert(path); @@ -177,9 +256,9 @@ static void addPathAndFileToVirtualColumns(Block & block, const String & path, s block.getByName("_file").column->assumeMutableRef().insert(file); } - if (parse_hive_columns) + if (use_hive_partitioning) { - const auto keys_and_values = HivePartitioningUtils::parseHivePartitioningKeysAndValues(path); + const auto keys_and_values = parseHivePartitioningKeysAndValues(path); for (const auto & [key, value] : keys_and_values) { if (const auto * column = block.findByName(key)) @@ -193,7 +272,7 @@ static void addPathAndFileToVirtualColumns(Block & block, const String & path, s block.getByName("_idx").column->assumeMutableRef().insert(idx); } -std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const NamesAndTypesList & hive_columns) +std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns) { if (!predicate || virtual_columns.empty()) return {}; @@ -206,16 +285,11 @@ std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * pr block.insert({column.type->createColumn(), column.type, column.name}); } - for (const auto & column : hive_columns) - { - block.insert({column.type->createColumn(), column.type, column.name}); - } - block.insert({ColumnUInt64::create(), std::make_shared(), "_idx"}); return splitFilterDagForAllowedInputs(predicate, &block); } -ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const NamesAndTypesList & hive_columns, const ContextPtr & context) +ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { Block block; NameSet common_virtuals = getVirtualNamesForFileLikeStorage(); @@ -224,16 +298,10 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const if (column.name == "_file" || column.name == "_path" || !common_virtuals.contains(column.name)) block.insert({column.type->createColumn(), column.type, column.name}); } - - for (const auto & column : hive_columns) - { - block.insert({column.type->createColumn(), column.type, column.name}); - } - block.insert({ColumnUInt64::create(), std::make_shared(), "_idx"}); for (size_t i = 0; i != paths.size(); ++i) - addPathAndFileToVirtualColumns(block, paths[i], i, getFormatSettings(context), /* parse_hive_columns */ !hive_columns.empty()); + addPathAndFileToVirtualColumns(block, paths[i], i, getFormatSettings(context), context->getSettingsRef()[Setting::use_hive_partitioning]); 
filterBlockWithExpression(actions, block); @@ -242,8 +310,12 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, - VirtualsForFileLikeStorage virtual_values, ContextPtr) + VirtualsForFileLikeStorage virtual_values, ContextPtr context) { + HivePartitioningKeysAndValues hive_map; + if (context->getSettingsRef()[Setting::use_hive_partitioning]) + hive_map = parseHivePartitioningKeysAndValues(virtual_values.path); + for (const auto & virtual_column : requested_virtual_columns) { if (virtual_column.name == "_path") @@ -277,6 +349,10 @@ void addRequestedFileLikeStorageVirtualsToChunk( else chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); } + else if (auto it = hive_map.find(virtual_column.getNameInStorage()); it != hive_map.end()) + { + chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), convertFieldToType(Field(it->second), *virtual_column.type))->convertToFullColumnIfConst()); + } else if (virtual_column.name == "_etag") { if (virtual_values.etag) diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index 1ceb41cf9b4b..a5a1567627ef 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -6,6 +6,8 @@ #include #include #include +#include + namespace DB { @@ -70,16 +72,21 @@ auto extractSingleValueFromBlock(const Block & block, const String & name) } NameSet getVirtualNamesForFileLikeStorage(); -VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & storage_columns); +VirtualColumnsDescription getVirtualsForFileLikeStorage( + ColumnsDescription & storage_columns, + const ContextPtr & context, + const std::string & sample_path = "", + std::optional format_settings_ = std::nullopt, + bool is_data_lake = false); -std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const NamesAndTypesList & hive_columns = {}); +std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns); -ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const NamesAndTypesList & hive_columns, const ContextPtr & context); +ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context); template -void filterByPathOrFile(std::vector & sources, const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const NamesAndTypesList & hive_columns, const ContextPtr & context) +void filterByPathOrFile(std::vector & sources, const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { - auto indexes_column = getFilterByPathAndFileIndexes(paths, actions, virtual_columns, hive_columns, context); + auto indexes_column = getFilterByPathAndFileIndexes(paths, actions, virtual_columns, context); const auto & indexes = typeid_cast(*indexes_column).getData(); if (indexes.size() == sources.size()) return; @@ -104,6 +111,10 @@ void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, VirtualsForFileLikeStorage 
virtual_values, ContextPtr context); +using HivePartitioningKeysAndValues = absl::flat_hash_map; + +HivePartitioningKeysAndValues parseHivePartitioningKeysAndValues(const String & path); + } } diff --git a/src/Storages/checkAndGetLiteralArgument.cpp b/src/Storages/checkAndGetLiteralArgument.cpp index bb21d2b564e7..8298e703cb2c 100644 --- a/src/Storages/checkAndGetLiteralArgument.cpp +++ b/src/Storages/checkAndGetLiteralArgument.cpp @@ -47,37 +47,10 @@ T checkAndGetLiteralArgument(const ASTLiteral & arg, const String & arg_name) return arg.value.safeGet(); } -template -std::optional tryGetLiteralArgument(const ASTPtr & arg, const String & arg_name) -{ - if (arg) - { - if (const auto * func = arg->as(); func && func->name == "_CAST") - { - return tryGetLiteralArgument(func->arguments->children.at(0), arg_name); - } - - if (arg->as()) - { - try - { - return checkAndGetLiteralArgument(*arg->as(), arg_name); - } - catch (...) - { - return std::nullopt; - } - } - } - - return std::nullopt; -} - template String checkAndGetLiteralArgument(const ASTPtr &, const String &); template UInt64 checkAndGetLiteralArgument(const ASTPtr &, const String &); template UInt8 checkAndGetLiteralArgument(const ASTPtr &, const String &); template bool checkAndGetLiteralArgument(const ASTPtr &, const String &); template String checkAndGetLiteralArgument(const ASTLiteral &, const String &); template UInt64 checkAndGetLiteralArgument(const ASTLiteral &, const String &); -template std::optional tryGetLiteralArgument(const ASTPtr & arg, const String & arg_name); } diff --git a/src/Storages/checkAndGetLiteralArgument.h b/src/Storages/checkAndGetLiteralArgument.h index 757ffa15c29a..5e988a5e63d8 100644 --- a/src/Storages/checkAndGetLiteralArgument.h +++ b/src/Storages/checkAndGetLiteralArgument.h @@ -14,7 +14,4 @@ T checkAndGetLiteralArgument(const ASTPtr & arg, const String & arg_name); template T checkAndGetLiteralArgument(const ASTLiteral & arg, const String & arg_name); -template -std::optional tryGetLiteralArgument(const ASTPtr & arg, const String & arg_name); - } diff --git a/src/Storages/prepareReadingFromFormat.cpp b/src/Storages/prepareReadingFromFormat.cpp index 82c40020116b..208003a33e46 100644 --- a/src/Storages/prepareReadingFromFormat.cpp +++ b/src/Storages/prepareReadingFromFormat.cpp @@ -18,11 +18,8 @@ ReadFromFormatInfo prepareReadingFromFormat( const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context, - bool supports_subset_of_columns, - const PrepareReadingFromFormatHiveParams & hive_parameters) + bool supports_subset_of_columns) { - const NamesAndTypesList & columns_in_data_file = - hive_parameters.file_columns.empty() ? storage_snapshot->metadata->getColumns().getAllPhysical() : hive_parameters.file_columns; ReadFromFormatInfo info; /// Collect requested virtual columns and remove them from requested columns. 
Strings columns_to_read; @@ -30,21 +27,16 @@ ReadFromFormatInfo prepareReadingFromFormat( { if (auto virtual_column = storage_snapshot->virtual_columns->tryGet(column_name)) info.requested_virtual_columns.emplace_back(std::move(*virtual_column)); - else if (auto it = hive_parameters.hive_partition_columns_to_read_from_file_path_map.find(column_name); it != hive_parameters.hive_partition_columns_to_read_from_file_path_map.end()) - info.hive_partition_columns_to_read_from_file_path.emplace_back(it->first, it->second); else columns_to_read.push_back(column_name); } - /// Create header for Source that will contain all requested columns including virtual and hive columns at the end + /// Create header for Source that will contain all requested columns including virtual columns at the end /// (because they will be added to the chunk after reading regular columns). info.source_header = storage_snapshot->getSampleBlockForColumns(columns_to_read); for (const auto & requested_virtual_column : info.requested_virtual_columns) info.source_header.insert({requested_virtual_column.type->createColumn(), requested_virtual_column.type, requested_virtual_column.name}); - for (const auto & column_from_file_path : info.hive_partition_columns_to_read_from_file_path) - info.source_header.insert({column_from_file_path.type->createColumn(), column_from_file_path.type, column_from_file_path.name}); - /// Set requested columns that should be read from data. info.requested_columns = storage_snapshot->getColumnsByNames(GetColumnsOptions(GetColumnsOptions::All).withSubcolumns(), columns_to_read); @@ -53,7 +45,7 @@ ReadFromFormatInfo prepareReadingFromFormat( /// If only virtual columns were requested, just read the smallest column. if (columns_to_read.empty()) { - columns_to_read.push_back(ExpressionActions::getSmallestColumn(columns_in_data_file).name); + columns_to_read.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name); } /// We need to replace all subcolumns with their nested columns (e.g `a.b`, `a.b.c`, `x.y` -> `a`, `x`), /// because most formats cannot extract subcolumns on their own. @@ -80,7 +72,7 @@ ReadFromFormatInfo prepareReadingFromFormat( /// Requested columns/subcolumns will be extracted after reading. else { - info.columns_description = storage_snapshot->getDescriptionForColumns(columns_in_data_file.getNames()); + info.columns_description = storage_snapshot->metadata->getColumns(); } /// Create header for InputFormat with columns that will be read from the data. diff --git a/src/Storages/prepareReadingFromFormat.h b/src/Storages/prepareReadingFromFormat.h index a5009eff89f2..02e42056d0cc 100644 --- a/src/Storages/prepareReadingFromFormat.h +++ b/src/Storages/prepareReadingFromFormat.h @@ -24,26 +24,10 @@ namespace DB /// Hints for the serialization of columns. /// For example can be retrieved from the destination table in INSERT SELECT query. SerializationInfoByName serialization_hints; - /// The list of hive partition columns. It shall be read from the path regardless if it is present in the file - NamesAndTypesList hive_partition_columns_to_read_from_file_path; - }; - - struct PrepareReadingFromFormatHiveParams - { - /// Columns which exist inside data file. - NamesAndTypesList file_columns; - /// Columns which are read from path to data file. - /// (Hive partition columns). - std::unordered_map hive_partition_columns_to_read_from_file_path_map; }; /// Get all needed information for reading from data in some input format. 
- ReadFromFormatInfo prepareReadingFromFormat( - const Strings & requested_columns, - const StorageSnapshotPtr & storage_snapshot, - const ContextPtr & context, - bool supports_subset_of_columns, - const PrepareReadingFromFormatHiveParams & hive_parameters = {}); + ReadFromFormatInfo prepareReadingFromFormat(const Strings & requested_columns, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context, bool supports_subset_of_columns); /// Returns the serialization hints from the insertion table (if it's set in the Context). SerializationInfoByName getSerializationHintsForFileLikeStorage(const StorageMetadataPtr & metadata_snapshot, const ContextPtr & context); diff --git a/src/Storages/tests/gtest_hive_partitioning_utils.cpp b/src/Storages/tests/gtest_virtual_column_utils.cpp similarity index 66% rename from src/Storages/tests/gtest_hive_partitioning_utils.cpp rename to src/Storages/tests/gtest_virtual_column_utils.cpp index 6ca13fc844ac..c46cf696b3d7 100644 --- a/src/Storages/tests/gtest_hive_partitioning_utils.cpp +++ b/src/Storages/tests/gtest_virtual_column_utils.cpp @@ -1,10 +1,10 @@ #include #include -#include +#include #include using namespace DB; -using namespace DB::HivePartitioningUtils; +using namespace DB::VirtualColumnUtils; TEST(VirtualColumnUtils, parseHivePartitioningKeysAndValuesEmptyValue) { @@ -63,3 +63,25 @@ TEST(VirtualColumnUtils, parseHivePartitioningKeysFilenameWithPairInIt) ASSERT_EQ(map.at("year"), "2022"); } + +TEST(VirtualColumnUtils, getVirtualsForFileLikeStorageEmptyValue) +{ + static std::string empty_value_path = "/output_data/year=2022/country=/data_0.parquet"; + + const auto & context_holder = getContext(); + + auto year_column = ColumnDescription("year", std::make_shared()); + auto country_column = ColumnDescription("country", std::make_shared()); + auto non_partition_column = ColumnDescription("non_partition", std::make_shared()); + + ColumnsDescription columns; + + columns.add(year_column); + columns.add(country_column); + columns.add(non_partition_column); + + auto res = getVirtualsForFileLikeStorage(columns, context_holder.context, empty_value_path); + + ASSERT_TRUE(res.has("year")); + ASSERT_TRUE(res.has("country")); +} diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index fe971bdbed7c..f24580608c5e 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -80,10 +80,6 @@ class ITableFunction : public std::enable_shared_from_this virtual bool canBeUsedToCreateTable() const { return true; } - // INSERT INTO TABLE FUNCTION ... PARTITION BY - // Set partition by expression so `ITableFunctionObjectStorage` can construct a proper representation - virtual void setPartitionBy(const ASTPtr &) {} - /// Create storage according to the query. 
StoragePtr execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}, bool use_global_context = false, bool is_insert_query = false) const; diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index b36bf1b63e3e..29a7c5c04de3 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -16,7 +16,6 @@ #include -#include #include #include #include @@ -25,6 +24,7 @@ #include #include #include +#include namespace DB @@ -190,7 +190,6 @@ StoragePtr TableFunctionObjectStorage:: StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, - partition_by, context); storage->startup(); @@ -208,8 +207,8 @@ StoragePtr TableFunctionObjectStorage:: /* format_settings */ std::nullopt, /* mode */ LoadingStrictnessLevel::CREATE, /* distributed_processing */ is_secondary_query, - /* partition_by */ partition_by, - /* is_table_function */true); + /* partition_by */ nullptr, + /* is_table_function */ true); storage->startup(); return storage; diff --git a/src/TableFunctions/TableFunctionObjectStorage.h b/src/TableFunctions/TableFunctionObjectStorage.h index b4090093dae9..d10781e8f0ec 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.h +++ b/src/TableFunctions/TableFunctionObjectStorage.h @@ -177,11 +177,6 @@ class TableFunctionObjectStorage : public ITableFunction Configuration().addStructureAndFormatToArgsIfNeeded(args, structure, format, context, /*with_structure=*/true); } - void setPartitionBy(const ASTPtr & partition_by_) override - { - partition_by = partition_by_; - } - protected: using ConfigurationPtr = StorageObjectStorage::ConfigurationPtr; @@ -206,7 +201,6 @@ class TableFunctionObjectStorage : public ITableFunction mutable ObjectStoragePtr object_storage; ColumnsDescription structure_hint; std::shared_ptr settings; - ASTPtr partition_by; std::vector skipAnalysisForArguments(const QueryTreeNodePtr & query_node_table_function, ContextPtr context) const override; }; diff --git a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp index 94cb4f3c52a5..6083d9a915f0 100644 --- a/src/TableFunctions/TableFunctionObjectStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionObjectStorageCluster.cpp @@ -45,7 +45,7 @@ StoragePtr TableFunctionObjectStorageCluster.parquet') AS _path, counter from test_hive_partition_strategy order by counter;") - - assert "cont/test_hive_partition_strategy/year=2020/country=Brazil/.parquet\t1\ncont/test_hive_partition_strategy/year=2021/country=Russia/.parquet\t2\ncont/test_hive_partition_strategy/year=2021/country=Russia/.parquet\t3\n" == res - - azure_query(node, "DROP TABLE IF EXISTS test_hive_partition_strategy") diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 0809e3f8c798..18085216da36 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -634,7 +634,7 @@ def test_wrong_s3_syntax(started_cluster): instance = started_cluster.instances["dummy"] # type: ClickHouseInstance expected_err_msg = "Code: 42" # NUMBER_OF_ARGUMENTS_DOESNT_MATCH - query = "create table test_table_s3_syntax (id UInt32) ENGINE = S3('', '', '', '', '', '', '', '', '')" + query = "create table test_table_s3_syntax (id UInt32) ENGINE = S3('', '', '', '', '', '', '')" assert expected_err_msg in 
instance.query_and_get_error(query) expected_err_msg = "Code: 36" # BAD_ARGUMENTS diff --git a/tests/queries/0_stateless/01944_insert_partition_by.sql b/tests/queries/0_stateless/01944_insert_partition_by.sql index 03bbd17b8ce7..ac38fcee4905 100644 --- a/tests/queries/0_stateless/01944_insert_partition_by.sql +++ b/tests/queries/0_stateless/01944_insert_partition_by.sql @@ -7,3 +7,5 @@ INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.cs INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc}{abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc*abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/{_partition_id}', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS } +INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS } +INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'aa/bb'); -- { serverError CANNOT_PARSE_TEXT } diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index 85afdea228d2..25a04b4209d9 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -1,14 +1,14 @@ TESTING THE FILE HIVE PARTITIONING -Elizabeth last -Elizabeth Frank -Elizabeth Moreno -Elizabeth Guzman -Elizabeth Stephens -Elizabeth Franklin -Elizabeth Gibson -Elizabeth Greer -Elizabeth Delgado -Elizabeth Cross + last Elizabeth +Frank Elizabeth +Moreno Elizabeth +Guzman Elizabeth +Stephens Elizabeth +Franklin Elizabeth +Gibson Elizabeth +Greer Elizabeth +Delgado Elizabeth +Cross Elizabeth first last Elizabeth Jorge Frank Elizabeth Hunter Moreno Elizabeth @@ -19,19 +19,19 @@ Stanley Gibson Elizabeth Eugenia Greer Elizabeth Jeffery Delgado Elizabeth Clara Cross Elizabeth -Elizabeth last -Elizabeth Frank -Elizabeth Moreno -Elizabeth Guzman -Elizabeth Stephens -Elizabeth Franklin -Elizabeth Gibson -Elizabeth Greer -Elizabeth Delgado -Elizabeth Cross + last Elizabeth +Frank Elizabeth +Moreno Elizabeth +Guzman Elizabeth +Stephens Elizabeth +Franklin Elizabeth +Gibson Elizabeth +Greer Elizabeth +Delgado Elizabeth +Cross Elizabeth 42 2020-01-01 [1,2,3] 42.42 -Array(Int64) Float64 +Array(Int64) LowCardinality(Float64) 101 2071 2071 @@ -39,16 +39,16 @@ b 1 1 TESTING THE URL PARTITIONING -Elizabeth last -Elizabeth Frank -Elizabeth Moreno -Elizabeth Guzman -Elizabeth Stephens -Elizabeth Franklin -Elizabeth Gibson -Elizabeth Greer -Elizabeth Delgado -Elizabeth Cross + last Elizabeth +Frank Elizabeth +Moreno Elizabeth +Guzman Elizabeth +Stephens Elizabeth +Franklin Elizabeth +Gibson Elizabeth +Greer Elizabeth +Delgado Elizabeth +Cross Elizabeth first last Elizabeth Jorge Frank Elizabeth Hunter Moreno Elizabeth @@ -61,16 +61,16 @@ Jeffery Delgado Elizabeth Clara Cross Elizabeth 1 TESTING THE S3 PARTITIONING -Elizabeth last -Elizabeth Frank -Elizabeth Moreno -Elizabeth Guzman -Elizabeth Stephens -Elizabeth 
Franklin -Elizabeth Gibson -Elizabeth Greer -Elizabeth Delgado -Elizabeth Cross + last Elizabeth +Frank Elizabeth +Moreno Elizabeth +Guzman Elizabeth +Stephens Elizabeth +Franklin Elizabeth +Gibson Elizabeth +Greer Elizabeth +Delgado Elizabeth +Cross Elizabeth first last Elizabeth Jorge Frank Elizabeth Hunter Moreno Elizabeth @@ -81,38 +81,38 @@ Stanley Gibson Elizabeth Eugenia Greer Elizabeth Jeffery Delgado Elizabeth Clara Cross Elizabeth -Elizabeth last -Elizabeth Frank -Elizabeth Moreno -Elizabeth Guzman -Elizabeth Stephens -Elizabeth Franklin -Elizabeth Gibson -Elizabeth Greer -Elizabeth Delgado -Elizabeth Cross + last Elizabeth +Frank Elizabeth +Moreno Elizabeth +Guzman Elizabeth +Stephens Elizabeth +Franklin Elizabeth +Gibson Elizabeth +Greer Elizabeth +Delgado Elizabeth +Cross Elizabeth test/hive_partitioning/column0=Arthur/column1=/sample.parquet test/hive_partitioning/column0=Arthur/column1=ABC/sample.parquet test/hive_partitioning/column0=Arthur/column1=/sample.parquet OK TESTING THE S3CLUSTER PARTITIONING -Elizabeth last -Elizabeth Frank -Elizabeth Moreno -Elizabeth Guzman -Elizabeth Stephens -Elizabeth Franklin -Elizabeth Gibson -Elizabeth Greer -Elizabeth Delgado -Elizabeth Cross -Elizabeth last -Elizabeth Frank -Elizabeth Moreno -Elizabeth Guzman -Elizabeth Stephens -Elizabeth Franklin -Elizabeth Gibson -Elizabeth Greer -Elizabeth Delgado -Elizabeth Cross + last Elizabeth +Frank Elizabeth +Moreno Elizabeth +Guzman Elizabeth +Stephens Elizabeth +Franklin Elizabeth +Gibson Elizabeth +Greer Elizabeth +Delgado Elizabeth +Cross Elizabeth + last Elizabeth +Frank Elizabeth +Moreno Elizabeth +Guzman Elizabeth +Stephens Elizabeth +Franklin Elizabeth +Gibson Elizabeth +Greer Elizabeth +Delgado Elizabeth +Cross Elizabeth diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index 4a30a5d03083..e22583420a83 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -11,10 +11,10 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'" $CLICKHOUSE_LOCAL -q """ set use_hive_partitioning = 1; -SELECT column0, column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; -SELECT * FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; -SELECT * FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10; +SELECT *, non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10; SELECT number, date FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') LIMIT 1; SELECT array, float FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1; @@ -32,7 +32,7 @@ SELECT a FROM file('$CURDIR/data_hive/partitioning/a=b/a=b/sample.parquet') LIMI $CLICKHOUSE_LOCAL -q """ set use_hive_partitioning = 1; -SELECT * FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth1/sample.parquet') LIMIT 10; +SELECT *, column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth1/sample.parquet') LIMIT 10; """ 2>&1 | grep -c 
"INCORRECT_DATA" $CLICKHOUSE_LOCAL -q """ @@ -48,9 +48,9 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE URL PARTITIONING'" $CLICKHOUSE_LOCAL -q """ set use_hive_partitioning = 1; -SELECT * FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; -SELECT * FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;""" +SELECT *, non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;""" $CLICKHOUSE_LOCAL -q """ set use_hive_partitioning = 0; @@ -65,10 +65,10 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3 PARTITIONING'" $CLICKHOUSE_CLIENT -q """ set use_hive_partitioning = 1; -SELECT * FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; -SELECT * FROM s3('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; -SELECT * FROM s3('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10; +SELECT *, non_existing_column FROM s3('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10; SELECT _path FROM s3('http://localhost:11111/test/hive_partitioning/column0=Arthur/**.parquet') order by _path; SELECT _path FROM s3('http://localhost:11111/test/hive_partitioning/column0=Arthur/**.parquet') where column1 = ''; @@ -85,7 +85,7 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3CLUSTER PARTITIONING'" $CLICKHOUSE_CLIENT -q """ set use_hive_partitioning = 1; -SELECT * FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; -SELECT * FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10; +SELECT *, column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 10; """ diff --git a/tests/queries/0_stateless/03363_hive_style_partition.reference b/tests/queries/0_stateless/03363_hive_style_partition.reference deleted file mode 100644 index a340a0866d52..000000000000 --- a/tests/queries/0_stateless/03363_hive_style_partition.reference +++ /dev/null @@ -1,36 +0,0 @@ -test/t_03363_parquet/year=2022/country=USA/.parquet 1 -test/t_03363_parquet/year=2022/country=Canada/.parquet 2 -test/t_03363_parquet/year=2023/country=USA/.parquet 3 -test/t_03363_parquet/year=2023/country=Mexico/.parquet 4 -test/t_03363_parquet/year=2024/country=France/.parquet 5 -test/t_03363_parquet/year=2024/country=Germany/.parquet 6 -test/t_03363_parquet/year=2024/country=Germany/.parquet 7 -test/t_03363_parquet/year=1999/country=Brazil/.parquet 8 
-test/t_03363_parquet/year=2100/country=Japan/.parquet 9 -test/t_03363_parquet/year=2024/country=CN/.parquet 10 -test/t_03363_parquet/year=2025/country=/.parquet 11 -test/t_03363_csv/year=2022/country=USA/.csv 1 -test/t_03363_csv/year=2022/country=Canada/.csv 2 -test/t_03363_csv/year=2023/country=USA/.csv 3 -test/t_03363_csv/year=2023/country=Mexico/.csv 4 -test/t_03363_csv/year=2024/country=France/.csv 5 -test/t_03363_csv/year=2024/country=Germany/.csv 6 -test/t_03363_csv/year=2024/country=Germany/.csv 7 -test/t_03363_csv/year=1999/country=Brazil/.csv 8 -test/t_03363_csv/year=2100/country=Japan/.csv 9 -test/t_03363_csv/year=2024/country=CN/.csv 10 -test/t_03363_csv/year=2025/country=/.csv 11 -test/t_03363_function/year=2022/country=USA/.parquet 1 -test/t_03363_function/year=2022/country=Canada/.parquet 2 -test/t_03363_function/year=2023/country=USA/.parquet 3 -test/t_03363_function/year=2023/country=Mexico/.parquet 4 -test/t_03363_function/year=2024/country=France/.parquet 5 -test/t_03363_function/year=2024/country=Germany/.parquet 6 -test/t_03363_function/year=2024/country=Germany/.parquet 7 -test/t_03363_function/year=1999/country=Brazil/.parquet 8 -test/t_03363_function/year=2100/country=Japan/.parquet 9 -test/t_03363_function/year=2024/country=CN/.parquet 10 -test/t_03363_function/year=2025/country=/.parquet 11 -1 -3 -USA 2022 1 diff --git a/tests/queries/0_stateless/03363_hive_style_partition.sql b/tests/queries/0_stateless/03363_hive_style_partition.sql deleted file mode 100644 index e701fdba0681..000000000000 --- a/tests/queries/0_stateless/03363_hive_style_partition.sql +++ /dev/null @@ -1,124 +0,0 @@ --- Tags: no-parallel, no-fasttest, no-random-settings - -DROP TABLE IF EXISTS t_03363_parquet, t_03363_csv; - -CREATE TABLE t_03363_parquet (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partition_strategy='hive') -PARTITION BY (year, country); - -INSERT INTO t_03363_parquet VALUES - (2022, 'USA', 1), - (2022, 'Canada', 2), - (2023, 'USA', 3), - (2023, 'Mexico', 4), - (2024, 'France', 5), - (2024, 'Germany', 6), - (2024, 'Germany', 7), - (1999, 'Brazil', 8), - (2100, 'Japan', 9), - (2024, 'CN', 10), - (2025, '', 11); - --- distinct because minio isn't cleaned up -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from t_03363_parquet order by counter; - --- CSV test -CREATE TABLE t_03363_csv (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_csv', format = CSV, partition_strategy='hive') -PARTITION BY (year, country); - -INSERT INTO t_03363_csv VALUES - (2022, 'USA', 1), - (2022, 'Canada', 2), - (2023, 'USA', 3), - (2023, 'Mexico', 4), - (2024, 'France', 5), - (2024, 'Germany', 6), - (2024, 'Germany', 7), - (1999, 'Brazil', 8), - (2100, 'Japan', 9), - (2024, 'CN', 10), - (2025, '', 11); - -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.csv', '/.csv') AS _path, counter from t_03363_csv order by counter; - --- s3 table function -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function', format=Parquet, partition_strategy='hive') PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet; -select distinct on (counter) replaceRegexpAll(_path, '/[0-9]+\\.parquet', '/.parquet') AS _path, counter from s3(s3_conn, filename='t_03363_function/**.parquet') order by counter; - --- create a "bucket" with mixed partitioning schemes so we can simulate a malformed storage -INSERT INTO FUNCTION 
s3(s3_conn, filename='t_03363_mixed_partitioning', format=Parquet, partition_strategy='hive') PARTITION BY (year) select 1 as id, 2025 as year; -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_mixed_partitioning', format=Parquet, partition_strategy='hive') PARTITION BY (country) select 1 as id, 'Brazil' as country; - --- Depends on the above two inserts, should throw exception because it could not find the hive partition columns it was looking for --- The format is null because one of the files contains the requested columns and might return the data before we throw the exception -select * from s3(s3_conn, filename='t_03363_mixed_partitioning/**.parquet') Format null; -- {serverError INCORRECT_DATA} - --- Depends on the above two inserts, should throw exception because it could not find the hive partition columns it was looking for --- The format is null because one of the files contains the requested columns and might return the data before we throw the exception -CREATE TABLE t_03363_mixed_partitioning (id Int32, year UInt16) ENGINE=S3(s3_conn, filename='t_03363_mixed_partitioning', format=Parquet, partition_strategy='hive') PARTITION BY (year); -SELECT * FROM t_03363_mixed_partitioning Format null; -- {serverError INCORRECT_DATA} - --- should output 1 because partition columns are not written down to the file by default when hive style is being used -select num_columns from s3(s3_conn, filename='t_03363_function/**.parquet', format=ParquetMetadata) limit 1; - -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_function_write_down_partition_columns', format=Parquet, partition_strategy='hive', partition_columns_in_data_file=1) PARTITION BY (year, country) SELECT country, year, counter FROM t_03363_parquet; -select num_columns from s3(s3_conn, filename='t_03363_function_write_down_partition_columns/**.parquet', format=ParquetMetadata) limit 1; - --- hive partitioning = 0 so we know it is not reading columns from the path -select * from s3(s3_conn, filename='t_03363_function_write_down_partition_columns/**.parquet', format=Parquet) order by counter limit 1 SETTINGS use_hive_partitioning=0; - --- only partition columns -INSERT INTO FUNCTION s3(s3_conn, filename='t_03363_parquet', format=Parquet, partition_strategy='hive') PARTITION BY (year, country) SELECT 2020 as year, 'Brazil' as country; -- {serverError INCORRECT_DATA}; - --- hive with partition id placeholder -CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_parquet/{_partition_id}', format = Parquet, partition_strategy='hive') -PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS}; - --- unknown partitioning style -CREATE TABLE t_03363_s3_sink (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_parquet', format = Parquet, partition_strategy='abc') -PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS}; - --- hive partition strategy can't be used without partition by clause -CREATE TABLE t_03363_s3_err (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_parquet', partition_strategy='hive', format=Parquet); -- {serverError BAD_ARGUMENTS} - --- hive partition strategy can't be used without partition by clause -INSERT INTO FUNCTION s3(s3_conn, filename = 't_03363_parquet', partition_strategy='hive', format=Parquet) VALUES 1; -- {serverError BAD_ARGUMENTS} - --- hive partition strategy can't be used with globbed path -CREATE TABLE t_03363_s3_err (year UInt16, country String, counter UInt8) 
-ENGINE = S3(s3_conn, filename = 't_03363_parquet/**', partition_strategy='hive', format=Parquet); -- {serverError BAD_ARGUMENTS} - --- hive partition strategy can't be used with globbed path -INSERT INTO FUNCTION s3(s3_conn, filename = 't_03363_parquet/**', partition_strategy='hive', format=Parquet) VALUES 1; -- {serverError BAD_ARGUMENTS} - --- partition_columns_in_data_file can't be zero for non hive -CREATE TABLE t_03363_s3_err (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_parquet{_partition_id}', partition_strategy='wildcard', format=Parquet, partition_columns_in_data_file=0) -PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS} - --- partition_columns_in_data_file can't be zero for non hive strategy -CREATE TABLE t_03363_s3_err (year UInt16, country String, counter UInt8) -ENGINE = S3(s3_conn, filename = 't_03363_parquet', format=Parquet, partition_columns_in_data_file=0) PARTITION BY (year, country); -- {serverError BAD_ARGUMENTS} - --- hive partition strategy can't be set in select statement? -select * from s3(s3_conn, filename='t_03363_function_write_down_partition_columns/**.parquet', format=Parquet, partition_strategy='hive'); -- {serverError BAD_ARGUMENTS} - --- do not support expressions in hive partitioning -CREATE TABLE t_invalid_expression (year UInt16, country String, counter UInt8) - ENGINE = S3(s3_conn, filename = 'invalid', format = Parquet, partition_strategy='hive') - PARTITION BY toString(year); -- {serverError BAD_ARGUMENTS} - --- floating types not supported -CREATE TABLE t_invalid_expression (year UInt16, country String, counter Float64) - ENGINE = S3(s3_conn, filename = 'invalid', format = Parquet, partition_strategy='hive') - PARTITION BY counter; -- {serverError BAD_ARGUMENTS} - --- Data lake like engines do not support the `partition_strategy` argument -CREATE TABLE t_03363_iceberg ENGINE=IcebergS3(s3_conn, filename = 'iceberg_data/default/t_iceberg/', format='parquet', url = 'http://minio1:9001/bucket/', partition_strategy='WILDCARD'); -- {serverError BAD_ARGUMENTS} -CREATE TABLE t_03363_iceberg ENGINE=IcebergS3(s3_conn, filename = 'iceberg_data/default/t_iceberg/', format='parquet', url = 'http://minio1:9001/bucket/', partition_strategy='HIVE'); -- {serverError BAD_ARGUMENTS} - -DROP TABLE IF EXISTS t_03363_parquet, t_03363_csv; diff --git a/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.reference b/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.reference deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql b/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql deleted file mode 100644 index 4b2ee57c7630..000000000000 --- a/tests/queries/0_stateless/03364_s3_globbed_path_in_bucket_portion.sql +++ /dev/null @@ -1,30 +0,0 @@ --- Tags: no-fasttest --- virtual hosted style -create table s3_03364 (id UInt32) engine=S3('http://{_partition_id}.s3.region.amazonaws.com/key'); -- {serverError BAD_ARGUMENTS} -create table s3_03364 (id UInt32) engine=S3('http://{_partition_id}something.s3.region.amazonaws.com/key'); -- {serverError BAD_ARGUMENTS} - -select * from s3('http://{_partition_id}.s3.region.amazonaws.com/key', 'Parquet'); -- {serverError BAD_ARGUMENTS} -select * from s3('http://{_partition_id}something.s3.region.amazonaws.com/key', 'Parquet'); -- {serverError BAD_ARGUMENTS} - -insert into table function s3('http://{_partition_id}.s3.region.amazonaws.com/key', 'NOSIGN', 
'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} -insert into table function s3('http://{_partition_id}something.s3.region.amazonaws.com/key', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} - --- path style -create table s3_03364 (id UInt32) engine=S3('http://s3.region.amazonaws.com/{_partition_id}'); -- {serverError BAD_ARGUMENTS} -create table s3_03364 (id UInt32) engine=S3('http://s3.region.amazonaws.com/{_partition_id}/key'); -- {serverError BAD_ARGUMENTS} - -select * from s3('http://s3.region.amazonaws.com/{_partition_id}', 'Parquet'); -- {serverError BAD_ARGUMENTS} -select * from s3('http://s3.region.amazonaws.com/{_partition_id}/key', 'Parquet'); -- {serverError BAD_ARGUMENTS} - -insert into table function s3('http://s3.region.amazonaws.com/{_partition_id}', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} -insert into table function s3('http://s3.region.amazonaws.com/{_partition_id}/key', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} - --- aws private link style -create table s3_03364 (id UInt32) engine=S3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}'); -- {serverError BAD_ARGUMENTS} -create table s3_03364 (id UInt32) engine=S3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}/key'); -- {serverError BAD_ARGUMENTS} - -select * from s3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}', 'Parquet'); -- {serverError BAD_ARGUMENTS} -select * from s3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}/key', 'Parquet'); -- {serverError BAD_ARGUMENTS} - -insert into table function s3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} -insert into table function s3('http://bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/{_partition_id}/key', 'NOSIGN', 'Parquet') select * from numbers(5); -- {serverError BAD_ARGUMENTS} diff --git a/tests/queries/0_stateless/03528_s3_insert_partition_by_whitespaces.sql b/tests/queries/0_stateless/03528_s3_insert_partition_by_whitespaces.sql index ffb0ad860df3..37cd23b00351 100644 --- a/tests/queries/0_stateless/03528_s3_insert_partition_by_whitespaces.sql +++ b/tests/queries/0_stateless/03528_s3_insert_partition_by_whitespaces.sql @@ -4,25 +4,26 @@ INSERT INTO FUNCTION s3( s3_conn, - filename = currentDatabase() || '/{_partition_id}/test.parquet', + filename = currentDatabase() || '/test1.parquet', format = Parquet ) - PARTITION BY 1 + PARTITION BY rand() % 10 SELECT * FROM system.numbers LIMIT 10; -SELECT * FROM s3(s3_conn, filename = currentDatabase() || '/1/test.parquet'); +SELECT * FROM s3(s3_conn, filename = currentDatabase() || '/test1.parquet'); + INSERT INTO FUNCTION s3( s3_conn, - filename = currentDatabase() || '/{_partition_id}/test.parquet', + filename = currentDatabase() || '/test2.parquet', format = Parquet - ) PARTITION BY 2 SELECT + ) PARTITION BY rand() % 10 SELECT * FROM system.numbers LIMIT 10; -SELECT * FROM s3(s3_conn, filename = currentDatabase() || '/2/test.parquet'); +SELECT * FROM s3(s3_conn, filename = currentDatabase() || '/test2.parquet'); diff --git a/tests/queries/0_stateless/03547_s3_partition_by_require_partition_wildcard.reference 
b/tests/queries/0_stateless/03547_s3_partition_by_require_partition_wildcard.reference deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/tests/queries/0_stateless/03547_s3_partition_by_require_partition_wildcard.sql b/tests/queries/0_stateless/03547_s3_partition_by_require_partition_wildcard.sql deleted file mode 100644 index 7d089bd6181a..000000000000 --- a/tests/queries/0_stateless/03547_s3_partition_by_require_partition_wildcard.sql +++ /dev/null @@ -1,3 +0,0 @@ --- Tags: no-parallel, no-fasttest, no-random-settings - -CREATE TABLE s3_03547 (id UInt64) ENGINE=S3(s3_conn, format=Parquet) PARTITION BY id; -- {serverError BAD_ARGUMENTS}
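
A minimal sketch of the behaviour this revert restores, for reviewers who want to exercise it end to end. It reuses the endpoints and credentials already used by the stateless tests above (`01944_insert_partition_by.sql` and `03203_hive_style_partitioning.sh`); the inserted row values are illustrative only and are not part of the patch.

```sql
-- Wildcard partitioning: the {_partition_id} placeholder in the key is replaced
-- with the partition key value on write (the partition_strategy argument is gone).
INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv',
                              'admin', 'admin', 'CSV', 'id Int32, val String')
PARTITION BY val
VALUES (1, 'abc');

-- Hive-style partition values are still inferred from the path as virtual columns
-- when use_hive_partitioning = 1, handled in VirtualColumnUtils after this revert.
SELECT *, column0
FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet')
LIMIT 10
SETTINGS use_hive_partitioning = 1;
```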