-
Notifications
You must be signed in to change notification settings - Fork 11
use parquet metadata cache for parquetmetadata format as well #636
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
85b268e
cb8d032
760bc30
79da2cf
a0f7b08
8abf0b1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| #include <Processors/Formats/Impl/ParquetFileMetaDataCache.h> | ||
|
|
||
| // Use the same test as the guard in ParquetFileMetaDataCache.h (`#if USE_PARQUET`). With `#ifdef`, a USE_PARQUET that is defined as 0 would still compile the definitions below even though the header declared nothing. | ||
| #if USE_PARQUET | ||
|
|
||
| namespace DB | ||
| { | ||
|
|
||
| /// Builds the cache by forwarding 0 to the CacheBase constructor. | ||
| /// NOTE(review): 0 is presumably the initial size limit, with the real limit configured elsewhere -- confirm against CacheBase. | ||
| ParquetFileMetaDataCache::ParquetFileMetaDataCache() | ||
| : CacheBase<String, parquet::FileMetaData>(0) | ||
| { | ||
| } | ||
|
|
||
| /// Returns a pointer to the single shared cache object. | ||
| /// The object is a function-local static, so it is constructed on the first call and every later call returns the same address. | ||
| ParquetFileMetaDataCache * ParquetFileMetaDataCache::instance() | ||
| { | ||
| static ParquetFileMetaDataCache shared_cache; | ||
| return &shared_cache; | ||
| } | ||
|
|
||
| } | ||
|
|
||
| #endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| #pragma once | ||
|
|
||
| #include "config.h" | ||
|
|
||
| #if USE_PARQUET | ||
|
|
||
| namespace parquet | ||
| { | ||
|
|
||
| class FileMetaData; | ||
|
|
||
| } | ||
|
|
||
| #include <Common/CacheBase.h> | ||
|
|
||
| namespace DB | ||
| { | ||
|
|
||
| /// Cache of parquet::FileMetaData objects keyed by String, built on top of CacheBase. | ||
| /// The constructor is private; the only way to obtain the cache is through instance(). | ||
| class ParquetFileMetaDataCache : public CacheBase<String, parquet::FileMetaData> | ||
| { | ||
| /// Private (class default access) so that callers cannot create additional caches. | ||
| ParquetFileMetaDataCache(); | ||
|
|
||
| public: | ||
| /// Accessor for the shared cache object. | ||
| static ParquetFileMetaDataCache * instance(); | ||
| }; | ||
|
|
||
| } | ||
|
|
||
| #endif |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,8 +22,17 @@ | |
| #include <parquet/statistics.h> | ||
| #include "ArrowBufferedStreams.h" | ||
| #include <DataTypes/NestedUtils.h> | ||
| #include <Core/Settings.h> | ||
| #include <Common/ProfileEvents.h> | ||
| #include <Processors/Formats/Impl/ParquetFileMetaDataCache.h> | ||
|
|
||
|
|
||
| namespace ProfileEvents | ||
| { | ||
| extern const Event ParquetMetaDataCacheHits; | ||
| extern const Event ParquetMetaDataCacheMisses; | ||
| } | ||
|
|
||
| namespace DB | ||
| { | ||
|
|
||
|
|
@@ -32,6 +41,11 @@ namespace ErrorCodes | |
| extern const int BAD_ARGUMENTS; | ||
| } | ||
|
|
||
| namespace Setting | ||
| { | ||
| extern const SettingsBool input_format_parquet_use_metadata_cache; | ||
| } | ||
|
|
||
| static NamesAndTypesList getHeaderForParquetMetadata() | ||
| { | ||
| NamesAndTypesList names_and_types{ | ||
|
|
@@ -129,10 +143,35 @@ void checkHeader(const Block & header) | |
| static std::shared_ptr<parquet::FileMetaData> getFileMetadata( | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are two implementations of
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I do not see a lot of duplication (as code is really a bit different). But it depends on whether we are planning to keep this here or move it to upstream later. If we are keeping it here, I would avoid unnecessary refactoring in this case -- this code does not look like it will change a lot, but changing code in more places may cause additional conflicts |
||
| ReadBuffer & in, | ||
| const FormatSettings & format_settings, | ||
| std::atomic<int> & is_stopped) | ||
| std::atomic<int> & is_stopped, | ||
| ParquetMetadataInputFormat::Cache metadata_cache) | ||
| { | ||
| auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); | ||
| return parquet::ReadMetaData(arrow_file); | ||
| // in-memory cache is not implemented for local file operations, only for remote files | ||
| // there is a chance the user sets `input_format_parquet_use_metadata_cache=1` for a local file operation | ||
| // and the cache_key won't be set. Therefore, we also need to check for metadata_cache.key | ||
| if (!metadata_cache.use_cache || metadata_cache.key.empty()) | ||
| { | ||
| auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); | ||
| return parquet::ReadMetaData(arrow_file); | ||
| } | ||
|
|
||
| auto [parquet_file_metadata, loaded] = ParquetFileMetaDataCache::instance()->getOrSet( | ||
| metadata_cache.key, | ||
| [&]() | ||
| { | ||
| auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true); | ||
| return parquet::ReadMetaData(arrow_file); | ||
| } | ||
| ); | ||
|
|
||
| if (loaded) | ||
| ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheMisses); | ||
| else | ||
| ProfileEvents::increment(ProfileEvents::ParquetMetaDataCacheHits); | ||
|
|
||
| return parquet_file_metadata; | ||
|
|
||
|
|
||
| } | ||
|
|
||
| ParquetMetadataInputFormat::ParquetMetadataInputFormat(ReadBuffer & in_, Block header_, const FormatSettings & format_settings_) | ||
|
|
@@ -147,7 +186,7 @@ Chunk ParquetMetadataInputFormat::read() | |
| if (done) | ||
| return res; | ||
|
|
||
| auto metadata = getFileMetadata(*in, format_settings, is_stopped); | ||
| auto metadata = getFileMetadata(*in, format_settings, is_stopped, metadata_cache); | ||
|
|
||
| const auto & header = getPort().getHeader(); | ||
| auto names_and_types = getHeaderForParquetMetadata(); | ||
|
|
@@ -486,6 +525,12 @@ void ParquetMetadataInputFormat::resetParser() | |
| done = false; | ||
| } | ||
|
|
||
| /// Stores the metadata-cache parameters for this input format. | ||
| /// `key_` becomes the cache key; the `input_format_parquet_use_metadata_cache` setting decides whether the cache is used. | ||
| /// getFileMetadata() bypasses the cache when the flag is off or the key is empty. | ||
| void ParquetMetadataInputFormat::setStorageRelatedUniqueKey(const Settings & settings, const String & key_) | ||
| { | ||
| metadata_cache.use_cache = settings[Setting::input_format_parquet_use_metadata_cache]; | ||
| metadata_cache.key = key_; | ||
| } | ||
|
|
||
| ParquetMetadataSchemaReader::ParquetMetadataSchemaReader(ReadBuffer & in_) | ||
| : ISchemaReader(in_) | ||
| { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,5 @@ | ||
| 10 | ||
| 10 | ||
| 10 | ||
| 10 | ||
| 10 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks too silly
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this singleton here? Can't the cache be just a member somewhere?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is used by two classes:
`ParquetBlockInputFormat` and `ParquetMetadataInputFormat`