-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-34785: [C++][Parquet] Add bloom filter write support #37400
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
f1c6dc0
6ebd6da
70c9267
48350d8
d2a659e
41236d8
8afba81
96c6691
c131341
220b58e
ad96c48
b756241
f43505b
3497f4a
fecd0f0
29cc1c1
ffbb491
4d63428
f689716
8e9cb16
7fd47be
7c4ff4e
feccee9
90245e7
d924e36
0340193
b78eed0
23828e1
6fd57dc
86a8760
f8e724c
447badf
0c1065c
5225e08
a779982
4195406
ed267bd
478889d
2992072
4852261
add1afd
f627e30
bb8d4a5
ad0f1af
e1de5bc
430742a
00f176e
17f4951
de27ce4
259f15b
057b542
34a4c28
70e3508
c587568
2223423
22030db
e9c550a
23fb3fa
d892819
ef3291d
7aee7dd
c5b1fb1
0898466
71f5906
d57ceea
26c2d07
d422ffa
e6bc6e1
dfaf0e8
0bafe78
ce30ebc
8286783
b079acb
3a5a491
cccb9a8
3cf9425
aac454e
83999cd
fa0c9b1
351da07
12364d0
8dec902
40c9079
d32c40b
a662563
61b6dff
18f1a47
2bfa278
f03a327
4aeff8b
0f50418
789d130
6dc8d88
0940cd8
a126e03
e560a28
bf5e859
0638b11
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -66,6 +66,8 @@ | |
| #include "parquet/arrow/schema.h" | ||
| #include "parquet/arrow/test_util.h" | ||
| #include "parquet/arrow/writer.h" | ||
| #include "parquet/bloom_filter.h" | ||
| #include "parquet/bloom_filter_reader.h" | ||
|
wgtmac marked this conversation as resolved.
Outdated
|
||
| #include "parquet/column_writer.h" | ||
| #include "parquet/file_writer.h" | ||
| #include "parquet/page_index.h" | ||
|
|
@@ -5256,7 +5258,7 @@ auto encode_double = [](double value) { | |
|
|
||
| } // namespace | ||
|
|
||
| class ParquetPageIndexRoundTripTest : public ::testing::Test { | ||
| class ParquetIndexRoundTripTest { | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| public: | ||
| void WriteFile(const std::shared_ptr<WriterProperties>& writer_properties, | ||
| const std::shared_ptr<::arrow::Table>& table) { | ||
|
|
@@ -5280,10 +5282,17 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test { | |
| ASSERT_OK_AND_ASSIGN(buffer_, sink->Finish()); | ||
| } | ||
|
|
||
| protected: | ||
| std::shared_ptr<Buffer> buffer_; | ||
| }; | ||
|
|
||
| class ParquetPageIndexRoundTripTest : public ::testing::Test, | ||
| public ParquetIndexRoundTripTest { | ||
| public: | ||
| void ReadPageIndexes(int expect_num_row_groups, int expect_num_pages, | ||
| const std::set<int>& expect_columns_without_index = {}) { | ||
| auto read_properties = default_arrow_reader_properties(); | ||
| auto reader = ParquetFileReader::Open(std::make_shared<BufferReader>(buffer_)); | ||
| auto reader = ParquetFileReader::Open(std::make_shared<BufferReader>(this->buffer_)); | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
|
|
||
| auto metadata = reader->metadata(); | ||
| ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups()); | ||
|
|
@@ -5348,7 +5357,6 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test { | |
| } | ||
|
|
||
| protected: | ||
| std::shared_ptr<Buffer> buffer_; | ||
| std::vector<ColumnIndexObject> column_indexes_; | ||
| }; | ||
|
|
||
|
|
@@ -5584,5 +5592,104 @@ TEST_F(ParquetPageIndexRoundTripTest, EnablePerColumn) { | |
| /*null_counts=*/{0}})); | ||
| } | ||
|
|
||
| class ParquetBloomFilterRoundTripTest : public ::testing::Test, | ||
|
wgtmac marked this conversation as resolved.
Outdated
|
||
| public ParquetIndexRoundTripTest { | ||
| public: | ||
| void ReadBloomFilters(int expect_num_row_groups, | ||
| const std::set<int>& expect_columns_without_filter = {}) { | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| auto read_properties = default_arrow_reader_properties(); | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| auto reader = ParquetFileReader::Open(std::make_shared<BufferReader>(buffer_)); | ||
|
|
||
| auto metadata = reader->metadata(); | ||
| ASSERT_EQ(expect_num_row_groups, metadata->num_row_groups()); | ||
|
|
||
| auto& bloom_filter_reader = reader->GetBloomFilterReader(); | ||
|
|
||
| for (int rg = 0; rg < metadata->num_row_groups(); ++rg) { | ||
| auto row_group_reader = bloom_filter_reader.RowGroup(rg); | ||
| ASSERT_NE(row_group_reader, nullptr); | ||
|
|
||
| for (int col = 0; col < metadata->num_columns(); ++col) { | ||
| bool expect_no_bloom_filter = expect_columns_without_filter.find(col) != | ||
| expect_columns_without_filter.cend(); | ||
|
|
||
| auto bloom_filter = row_group_reader->GetColumnBloomFilter(col); | ||
| if (expect_no_bloom_filter) { | ||
| ASSERT_EQ(bloom_filter, nullptr); | ||
| } else { | ||
| bloom_filters_.push_back(std::move(bloom_filter)); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about changing |
||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| template <typename ArrowType> | ||
| void verifyBloomFilter(const BloomFilter* bloom_filter, | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| const ::arrow::ChunkedArray& chunked_array) { | ||
| auto iter = ::arrow::stl::Begin<ArrowType>(chunked_array); | ||
| auto end = ::arrow::stl::End<ArrowType>(chunked_array); | ||
| while (iter != end) { | ||
| auto value = *iter; | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| if (value == std::nullopt) { | ||
| ++iter; | ||
| continue; | ||
| } | ||
| if constexpr (std::is_same_v<ArrowType, ::arrow::StringType>) { | ||
| ByteArray ba(value.value()); | ||
| EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(&ba))); | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| } else { | ||
| EXPECT_TRUE(bloom_filter->FindHash(bloom_filter->Hash(value.value()))); | ||
| } | ||
| ++iter; | ||
| } | ||
| } | ||
|
|
||
| protected: | ||
| std::vector<std::unique_ptr<BloomFilter>> bloom_filters_; | ||
| }; | ||
|
|
||
| TEST_F(ParquetBloomFilterRoundTripTest, SimpleRoundTrip) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The three test cases below share a lot of common logic (with exactly same data). Should we refactor them to eliminate the duplicate? |
||
| BloomFilterOptions options; | ||
| options.ndv = 100; | ||
| auto writer_properties = WriterProperties::Builder() | ||
| .set_bloom_filter_options(options) | ||
| ->max_row_group_length(4) | ||
| ->build(); | ||
| auto schema = ::arrow::schema( | ||
| {::arrow::field("c0", ::arrow::int64()), ::arrow::field("c1", ::arrow::utf8())}); | ||
| auto table = ::arrow::TableFromJSON(schema, {R"([ | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| [1, "a" ], | ||
| [2, "b" ], | ||
| [3, "c" ], | ||
| [null, "d"], | ||
| [5, null], | ||
| [6, "f" ] | ||
| ])"}); | ||
| WriteFile(writer_properties, table); | ||
|
|
||
| ReadBloomFilters(/*expect_num_row_groups=*/2); | ||
| ASSERT_EQ(4, bloom_filters_.size()); | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| { | ||
| ASSERT_NE(nullptr, bloom_filters_[0]); | ||
| auto col = table->column(0)->Slice(0, 4); | ||
| verifyBloomFilter<::arrow::Int64Type>(bloom_filters_[0].get(), *col); | ||
| } | ||
| { | ||
| ASSERT_NE(nullptr, bloom_filters_[1]); | ||
| auto col = table->column(1)->Slice(0, 4); | ||
| verifyBloomFilter<::arrow::StringType>(bloom_filters_[1].get(), *col); | ||
| } | ||
| { | ||
| ASSERT_NE(nullptr, bloom_filters_[2]); | ||
| auto col = table->column(0)->Slice(4, 2); | ||
| verifyBloomFilter<::arrow::Int64Type>(bloom_filters_[2].get(), *col); | ||
| } | ||
| { | ||
| ASSERT_NE(nullptr, bloom_filters_[3]); | ||
| auto col = table->column(1)->Slice(4, 2); | ||
| verifyBloomFilter<::arrow::StringType>(bloom_filters_[3].get(), *col); | ||
| } | ||
| } | ||
|
|
||
| } // namespace arrow | ||
| } // namespace parquet | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,142 @@ | ||||||
| // Licensed to the Apache Software Foundation (ASF) under one | ||||||
| // or more contributor license agreements. See the NOTICE file | ||||||
| // distributed with this work for additional information | ||||||
| // regarding copyright ownership. The ASF licenses this file | ||||||
| // to you under the Apache License, Version 2.0 (the | ||||||
| // "License"); you may not use this file except in compliance | ||||||
| // with the License. You may obtain a copy of the License at | ||||||
| // | ||||||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||||||
| // | ||||||
| // Unless required by applicable law or agreed to in writing, | ||||||
| // software distributed under the License is distributed on an | ||||||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||||||
| // KIND, either express or implied. See the License for the | ||||||
| // specific language governing permissions and limitations | ||||||
| // under the License. | ||||||
|
|
||||||
| // This module implements the BloomFilterBuilder, which collects bloom filters | ||||||
| // for the columns of each row group during writing, serializes them to the | ||||||
| // output stream, and records their locations for the footer metadata. | ||||||
|
|
||||||
| #include "parquet/bloom_filter_builder.h" | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
|
|
||||||
| #include <map> | ||||||
| #include <utility> | ||||||
| #include <vector> | ||||||
|
|
||||||
| #include "arrow/io/interfaces.h" | ||||||
|
|
||||||
| #include "parquet/bloom_filter.h" | ||||||
| #include "parquet/exception.h" | ||||||
| #include "parquet/metadata.h" | ||||||
| #include "parquet/properties.h" | ||||||
|
|
||||||
| namespace parquet { | ||||||
|
|
||||||
| class BloomFilterBuilderImpl : public BloomFilterBuilder { | ||||||
| public: | ||||||
| explicit BloomFilterBuilderImpl(const SchemaDescriptor* schema, | ||||||
| WriterProperties properties) | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the point of making a copy here?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Anyway underlying builder doesn't hold a reference here?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The builder cannot outlive the FileWriter, so why not simply follow other places like this https://github.com/search?q=repo%3Aapache%2Farrow+%22const+WriterProperties*%22&type=code
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||
| : schema_(schema), properties_(std::move(properties)) {} | ||||||
| /// Append a new row group to host all incoming bloom filters. | ||||||
| void AppendRowGroup() override; | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This isn't actually appending a new row-group just marking that a row-group is starting so filters should be reset?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. Parquet uses row-group level bloom filter, so this just setup a new row-group for filters |
||||||
|
|
||||||
| BloomFilter* GetOrCreateBloomFilter( | ||||||
| int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) override; | ||||||
|
|
||||||
| /// Serialize all bloom filters with header and bitset in the order of row group and | ||||||
| /// column id. Column encryption is not implemented yet. The side effect is that it | ||||||
| /// deletes all bloom filters after they have been flushed. | ||||||
| void WriteTo(::arrow::io::OutputStream* sink, BloomFilterLocation* location) override; | ||||||
|
|
||||||
| void Finish() override { finished_ = true; } | ||||||
|
|
||||||
| private: | ||||||
| /// Make sure the column ordinal is not out of bounds and the builder is in a good state. | ||||||
| void CheckState(int32_t column_ordinal) const { | ||||||
| if (finished_) { | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's checked once per rowgroup, so I don't think this would be heavy And I suspect that compiler can already well handle this under -O2: https://godbolt.org/z/6qvevr3G1 |
||||||
| throw ParquetException("BloomFilterBuilder is already finished."); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe make this message more accurate reflect the user error (WriteTo called multiple times)?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated |
||||||
| } | ||||||
| if (column_ordinal < 0 || column_ordinal >= schema_->num_columns()) { | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| throw ParquetException("Invalid column ordinal: ", column_ordinal); | ||||||
| } | ||||||
| if (row_group_bloom_filters_.empty()) { | ||||||
| throw ParquetException("No row group appended to BloomFilterBuilder."); | ||||||
| } | ||||||
| if (schema_->Column(column_ordinal)->physical_type() == Type::BOOLEAN) { | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| throw ParquetException("BloomFilterBuilder does not support Boolean."); | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| } | ||||||
| } | ||||||
|
|
||||||
| const SchemaDescriptor* schema_; | ||||||
| WriterProperties properties_; | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| bool finished_ = false; | ||||||
|
|
||||||
| // vector: row_group_ordinal | ||||||
| // map: column_ordinal -> bloom filter | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| std::vector<std::map<int32_t, std::unique_ptr<BloomFilter>>> row_group_bloom_filters_; | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| }; | ||||||
|
|
||||||
| std::unique_ptr<BloomFilterBuilder> BloomFilterBuilder::Make( | ||||||
| const SchemaDescriptor* schema, const WriterProperties& properties) { | ||||||
| return std::unique_ptr<BloomFilterBuilder>( | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| new BloomFilterBuilderImpl(schema, properties)); | ||||||
| } | ||||||
|
|
||||||
| void BloomFilterBuilderImpl::AppendRowGroup() { row_group_bloom_filters_.emplace_back(); } | ||||||
|
|
||||||
| BloomFilter* BloomFilterBuilderImpl::GetOrCreateBloomFilter( | ||||||
| int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) { | ||||||
| CheckState(column_ordinal); | ||||||
| std::unique_ptr<BloomFilter>& bloom_filter = | ||||||
| row_group_bloom_filters_.back()[column_ordinal]; | ||||||
| if (bloom_filter == nullptr) { | ||||||
| auto block_split_bloom_filter = | ||||||
| std::make_unique<BlockSplitBloomFilter>(properties_.memory_pool()); | ||||||
| block_split_bloom_filter->Init(BlockSplitBloomFilter::OptimalNumOfBytes( | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a recent discussion on the parquet mailing list about bloom filters and what good writers should do. My take-away was:
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Personally I think the best way is to buffering the hash values and making a decision later when hash value too much or buffer is too large. But personally I think we can first make a "static" config and enhance it later
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have reviewed that PR and it could be a followup change. Writer implementation has the freedom to try smart things. FYI, parquet-java also discards the bloom filter if dictionary encoding is applied to all data pages, though I don't think we should do the same thing. |
||||||
| bloom_filter_options.ndv, bloom_filter_options.fpp)); | ||||||
| bloom_filter = std::move(block_split_bloom_filter); | ||||||
| } | ||||||
| return bloom_filter.get(); | ||||||
| } | ||||||
|
|
||||||
| void BloomFilterBuilderImpl::WriteTo(::arrow::io::OutputStream* sink, | ||||||
| BloomFilterLocation* location) { | ||||||
| if (!finished_) { | ||||||
| throw ParquetException("Cannot call WriteTo() on an unfinished BloomFilterBuilder."); | ||||||
| } | ||||||
| if (row_group_bloom_filters_.empty()) { | ||||||
| // Return quickly if there is no bloom filter | ||||||
| return; | ||||||
| } | ||||||
|
|
||||||
| for (size_t row_group_ordinal = 0; row_group_ordinal < row_group_bloom_filters_.size(); | ||||||
| ++row_group_ordinal) { | ||||||
| const auto& row_group_bloom_filters = row_group_bloom_filters_[row_group_ordinal]; | ||||||
| // the whole row group has no bloom filter | ||||||
| if (row_group_bloom_filters.empty()) { | ||||||
| continue; | ||||||
| } | ||||||
| bool has_valid_bloom_filter = false; | ||||||
| int num_columns = schema_->num_columns(); | ||||||
| std::vector<std::optional<IndexLocation>> locations(num_columns, std::nullopt); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is it maybe better to make this a map. I expect the number of columns with a bloom filter to be relatively small compared to the number of overall columns?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤔 This reuse some structure in PageIndex, however, I think |
||||||
|
|
||||||
| // serialize bloom filter by ascending order of column id | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| for (int32_t column_id = 0; column_id < num_columns; ++column_id) { | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| auto iter = row_group_bloom_filters.find(column_id); | ||||||
| if (iter != row_group_bloom_filters.cend() && iter->second != nullptr) { | ||||||
| PARQUET_ASSIGN_OR_THROW(int64_t offset, sink->Tell()); | ||||||
| iter->second->WriteTo(sink); | ||||||
| PARQUET_ASSIGN_OR_THROW(int64_t pos, sink->Tell()); | ||||||
| has_valid_bloom_filter = true; | ||||||
| locations[column_id] = IndexLocation{offset, static_cast<int32_t>(pos - offset)}; | ||||||
| } | ||||||
| } | ||||||
| if (has_valid_bloom_filter) { | ||||||
|
mapleFU marked this conversation as resolved.
Outdated
|
||||||
| location->bloom_filter_location.emplace(row_group_ordinal, std::move(locations)); | ||||||
| } | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| } // namespace parquet | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| // This module defines an abstract interface for collecting the bloom filters | ||
| // of a Parquet file during writing and serializing them to the output stream, | ||
| // recording their locations for the footer metadata. | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
|
|
||
| #pragma once | ||
|
|
||
| #include "arrow/io/interfaces.h" | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| #include "parquet/types.h" | ||
|
|
||
| namespace parquet { | ||
|
|
||
| class BloomFilter; | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| class SchemaDescriptor; | ||
| struct BloomFilterOptions; | ||
| struct BloomFilterLocation; | ||
|
|
||
| namespace schema { | ||
| class ColumnPath; | ||
| } | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
|
|
||
| /// \brief Interface for collecting bloom filters of a Parquet file. | ||
| class PARQUET_EXPORT BloomFilterBuilder { | ||
|
wgtmac marked this conversation as resolved.
Outdated
|
||
| public: | ||
| /// \brief API convenience to create a BloomFilterBuilder. | ||
|
wgtmac marked this conversation as resolved.
Outdated
|
||
| static std::unique_ptr<BloomFilterBuilder> Make(const SchemaDescriptor* schema, | ||
| const WriterProperties& properties); | ||
|
|
||
| /// Append a new row group to host all incoming bloom filters. | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| virtual void AppendRowGroup() = 0; | ||
|
|
||
| /// \brief Get the BloomFilter from column ordinal. | ||
| /// | ||
| /// \param column_ordinal Column ordinal in schema, which is only for leaf columns. | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| /// \param bloom_filter_options The options(like num distinct values and false positive | ||
| /// rate) to create a BloomFilter. | ||
| /// | ||
| /// \return BloomFilter for the column and its memory ownership belongs to the | ||
| /// BloomFilterBuilder. | ||
| virtual BloomFilter* GetOrCreateBloomFilter( | ||
| int32_t column_ordinal, const BloomFilterOptions& bloom_filter_options) = 0; | ||
|
|
||
| /// \brief Write the bloom filter to sink. | ||
| /// | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| /// \param sink The output stream to write the bloom filters to. | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| /// \param[out] location The locations of all bloom filters, relative to the start of the sink. | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| virtual void WriteTo(::arrow::io::OutputStream* sink, | ||
| BloomFilterLocation* location) = 0; | ||
|
|
||
| /// \brief Complete the bloom filter builder and no more write is allowed. | ||
|
mapleFU marked this conversation as resolved.
Outdated
|
||
| virtual void Finish() = 0; | ||
|
|
||
| virtual ~BloomFilterBuilder() = default; | ||
| }; | ||
|
|
||
| } // namespace parquet | ||
Uh oh!
There was an error while loading. Please reload this page.