diff --git a/include/paimon/global_index/global_index_scan.h b/include/paimon/global_index/global_index_scan.h index 400a5c88..60651514 100644 --- a/include/paimon/global_index/global_index_scan.h +++ b/include/paimon/global_index/global_index_scan.h @@ -63,14 +63,15 @@ class PAIMON_EXPORT GlobalIndexScan { virtual Result> CreateRangeScan( const Range& range) = 0; - /// Returns the set of row ID ranges covered by this global index. + /// Returns row ID ranges covered by this global index (sorted and non-overlapping + /// ranges). /// /// Each `Range` represents a contiguous segment of row IDs for which global index /// data exists. This allows the query engine to parallelize scanning and be aware /// of ranges that are not covered by any global index. /// - /// @return A `Result` containing a set of non-overlapping `Range` objects. - virtual Result> GetRowRangeList() = 0; + /// @return A `Result` containing sorted and non-overlapping `Range` objects. + virtual Result> GetRowRangeList() = 0; }; } // namespace paimon diff --git a/include/paimon/global_index/indexed_split.h b/include/paimon/global_index/indexed_split.h new file mode 100644 index 00000000..73c1ab0e --- /dev/null +++ b/include/paimon/global_index/indexed_split.h @@ -0,0 +1,45 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paimon/table/source/data_split.h" +#include "paimon/utils/range.h" +#include "paimon/visibility.h" + +namespace paimon { +/// Indexed split for global index reading operation. +class PAIMON_EXPORT IndexedSplit : public Split { + public: + /// @returns The underlying physical data split containing actual data file details. + virtual std::shared_ptr GetDataSplit() const = 0; + + /// @returns A list of row intervals [start, end] indicating which rows + /// are relevant (e.g., passed predicate pushdown). + virtual const std::vector& RowRanges() const = 0; + + /// @returns A score for **each individual row** included in `RowRanges()`, + /// in the order they appear when traversing the ranges. + virtual const std::vector& Scores() const = 0; +}; +} // namespace paimon diff --git a/include/paimon/global_index/row_range_global_index_writer.h b/include/paimon/global_index/row_range_global_index_writer.h index a3c397c1..8ecccc0b 100644 --- a/include/paimon/global_index/row_range_global_index_writer.h +++ b/include/paimon/global_index/row_range_global_index_writer.h @@ -18,9 +18,9 @@ #include #include +#include "paimon/global_index/indexed_split.h" #include "paimon/memory/memory_pool.h" #include "paimon/result.h" -#include "paimon/table/source/data_split.h" #include "paimon/utils/range.h" #include "paimon/visibility.h" @@ -35,8 +35,8 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter { /// @param table_path Path to the table root directory where index files are stored. /// @param field_name Name of the indexed column (must be present in the table schema). /// @param index_type Type of global index to build (e.g., "bitmap", "lumina"). - /// @param split The data split (e.g., Parquet file) containing the actual data. - /// @param range Row ID range [from, to] for data to build index. 
+ /// @param indexed_split The indexed split containing the actual data (e.g., Parquet file) and + /// row id range [from, to] for data to build index. /// The range must be fully contained within the data covered /// by the given `split`. /// @param options Index-specific configuration (e.g., false positive rate for bloom @@ -47,7 +47,7 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter { /// or an error if indexing fails (e.g., unsupported type, I/O error). static Result> WriteIndex( const std::string& table_path, const std::string& field_name, const std::string& index_type, - const std::shared_ptr& split, const Range& range, + const std::shared_ptr& indexed_split, const std::map& options, const std::shared_ptr& pool); }; diff --git a/include/paimon/read_context.h b/include/paimon/read_context.h index 01504a26..59d7a32c 100644 --- a/include/paimon/read_context.h +++ b/include/paimon/read_context.h @@ -26,7 +26,6 @@ #include "paimon/predicate/predicate.h" #include "paimon/result.h" #include "paimon/type_fwd.h" -#include "paimon/utils/range.h" #include "paimon/visibility.h" namespace paimon { @@ -43,8 +42,8 @@ class PAIMON_EXPORT ReadContext { public: ReadContext(const std::string& path, const std::string& branch, const std::vector& read_schema, - const std::shared_ptr& predicate, const std::vector& row_ranges, - bool enable_predicate_filter, bool enable_prefetch, uint32_t prefetch_batch_count, + const std::shared_ptr& predicate, bool enable_predicate_filter, + bool enable_prefetch, uint32_t prefetch_batch_count, uint32_t prefetch_max_parallel_num, bool enable_multi_thread_row_to_batch, uint32_t row_to_batch_thread_number, const std::optional& table_schema, const std::shared_ptr& memory_pool, @@ -77,10 +76,6 @@ class PAIMON_EXPORT ReadContext { return predicate_; } - const std::vector& GetRowRanges() const { - return row_ranges_; - } - bool EnablePredicateFilter() const { return enable_predicate_filter_; } @@ -114,7 +109,6 @@ class PAIMON_EXPORT ReadContext { 
std::string branch_; std::vector read_schema_; std::shared_ptr predicate_; - std::vector row_ranges_; bool enable_predicate_filter_; bool enable_prefetch_; uint32_t prefetch_batch_count_; @@ -273,18 +267,6 @@ class PAIMON_EXPORT ReadContextBuilder { ReadContextBuilder& WithFileSystemSchemeToIdentifierMap( const std::map& fs_scheme_to_identifier_map); - /// Set specific row ranges to read for targeted data access. - /// - /// This is primarily used in data evolution scenarios where only specific rows - /// need to be read. File ranges that do not intersect with the specified row ranges - /// will be filtered out, improving performance by avoiding unnecessary I/O. - /// - /// @param row_ranges Vector of specific row ranges to read. - /// @return Reference to this builder for method chaining. - /// @note If not set, all rows in the selected files will be returned. - /// @note This is commonly used in data evolution mode for selective reading. - ReadContextBuilder& SetRowRanges(const std::vector& row_ranges); - /// Build and return a `ReadContext` instance with input validation. /// @return Result containing the constructed `ReadContext` or an error status. Result> Finish(); diff --git a/include/paimon/table/source/data_split.h b/include/paimon/table/source/data_split.h index 16637ed1..fd7f61bf 100644 --- a/include/paimon/table/source/data_split.h +++ b/include/paimon/table/source/data_split.h @@ -26,42 +26,15 @@ #include "paimon/data/timestamp.h" #include "paimon/memory/memory_pool.h" #include "paimon/result.h" +#include "paimon/table/source/split.h" #include "paimon/visibility.h" namespace paimon { class MemoryPool; -/// Input splits for read operation. Needed by most batch computation engines. Support Serialize and -/// Deserialize, compatible with java version. -class PAIMON_EXPORT DataSplit { +/// Input data split for reading operation. Needed by most batch computation engines. 
+class PAIMON_EXPORT DataSplit : public Split { public: - virtual ~DataSplit() = default; - - /// Deserialize a `DataSplit` from a binary buffer. - /// - /// Creates a `DataSplit` instance from its serialized binary representation. - /// This is typically used in distributed computing scenarios where splits - /// are transmitted between different nodes or processes. - /// - /// @param buffer Const pointer to the binary data containing the serialized `DataSplit`. - /// @param length Size of the buffer in bytes. - /// @param pool Memory pool for allocating objects during deserialization. - /// @return Result containing the deserialized `DataSplit` or an error status. - static Result> Deserialize(const char* buffer, size_t length, - const std::shared_ptr& pool); - - /// Serialize a `DataSplit` to a binary string. - /// - /// Converts a `DataSplit` instance to its binary representation for storage - /// or transmission. The serialized data can later be deserialized using - /// the Deserialize method. - /// - /// @param data_split The `DataSplit` instance to serialize. - /// @param pool Memory pool for allocating temporary objects during serialization. - /// @return Result containing the serialized binary data as a string or an error status. - static Result Serialize(const std::shared_ptr& data_split, - const std::shared_ptr& pool); - /// Metadata structure for simple data files. 
/// /// Contains essential information about a data file including its location, @@ -97,6 +70,7 @@ class PAIMON_EXPORT DataSplit { std::optional delete_row_count; bool operator==(const SimpleDataFileMeta& other) const; + std::string ToString() const; }; diff --git a/include/paimon/table/source/plan.h b/include/paimon/table/source/plan.h index 6508401c..ee1e9600 100644 --- a/include/paimon/table/source/plan.h +++ b/include/paimon/table/source/plan.h @@ -20,7 +20,7 @@ #include #include -#include "paimon/table/source/data_split.h" +#include "paimon/table/source/split.h" namespace paimon { /// %Result plan of this `TableScan`. @@ -28,7 +28,7 @@ class PAIMON_EXPORT Plan { public: virtual ~Plan() = default; /// %Result splits. - virtual const std::vector>& Splits() const = 0; + virtual const std::vector>& Splits() const = 0; /// Snapshot id of this plan, return `std::nullopt` if the table is empty. virtual std::optional SnapshotId() const = 0; }; diff --git a/include/paimon/table/source/split.h b/include/paimon/table/source/split.h new file mode 100644 index 00000000..dff58264 --- /dev/null +++ b/include/paimon/table/source/split.h @@ -0,0 +1,66 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paimon/memory/memory_pool.h" +#include "paimon/result.h" +#include "paimon/visibility.h" + +namespace paimon { +class MemoryPool; + +/// An input split for reading operation. Needed by most batch computation engines. Support +/// Serialize and Deserialize, compatible with java version. +/// This split can be either a `DataSplit` (for direct data file reads) or an `IndexedSplit` +/// (for reads leveraging global indexes). +class PAIMON_EXPORT Split { + public: + virtual ~Split() = default; + + /// Deserialize a `Split` from a binary buffer. + /// + /// Creates a `Split` instance from its serialized binary representation. + /// This is typically used in distributed computing scenarios where splits + /// are transmitted between different nodes or processes. + /// + /// @param buffer Const pointer to the binary data containing the serialized `Split`. + /// @param length Size of the buffer in bytes. + /// @param pool Memory pool for allocating objects during deserialization. + /// @return Result containing the deserialized `Split` or an error status. + static Result> Deserialize(const char* buffer, size_t length, + const std::shared_ptr& pool); + + /// Serialize a `Split` to a binary string. + /// + /// Converts a `Split` instance to its binary representation for storage + /// or transmission. The serialized data can later be deserialized using + /// the Deserialize method. + /// + /// @param split The `Split` instance to serialize. + /// @param pool Memory pool for allocating temporary objects during serialization. + /// @return Result containing the serialized binary data as a string or an error status. 
+ static Result Serialize(const std::shared_ptr& split, + const std::shared_ptr& pool); +}; +} // namespace paimon diff --git a/include/paimon/table/source/table_read.h b/include/paimon/table/source/table_read.h index a8044e7d..f8a3a761 100644 --- a/include/paimon/table/source/table_read.h +++ b/include/paimon/table/source/table_read.h @@ -24,15 +24,14 @@ #include "paimon/read_context.h" #include "paimon/reader/batch_reader.h" #include "paimon/result.h" -#include "paimon/table/source/data_split.h" +#include "paimon/table/source/split.h" #include "paimon/visibility.h" namespace paimon { -class DataSplit; class MemoryPool; class ReadContext; -/// Given a `DataSplit` or a list of `DataSplit`, generate a reader for batch reading. +/// Given a `Split` or a list of `Split`, generate a reader for batch reading. class PAIMON_EXPORT TableRead { public: virtual ~TableRead() = default; @@ -46,21 +45,21 @@ class PAIMON_EXPORT TableRead { /// Creates a `BatchReader` instance for reading data. /// /// This method creates a BatchReader that will be responsible for reading data from the - /// provided data splits. + /// provided splits. /// - /// @param data_splits A vector of shared pointers to `DataSplit` instances representing the + /// @param splits A vector of shared pointers to `Split` instances representing the /// data to be read. /// @return A Result containing a unique pointer to the `BatchReader` instance. virtual Result> CreateReader( - const std::vector>& data_splits); + const std::vector>& splits); - /// Creates a `BatchReader` instance for a single data split. + /// Creates a `BatchReader` instance for a single split. /// - /// @param data_split A shared pointer to the `DataSplit` instance that defines the data to be + /// @param split A shared pointer to the `Split` instance that defines the data to be /// read. /// @return A Result containing a unique pointer to the `BatchReader` instance. 
virtual Result> CreateReader( - const std::shared_ptr& data_split) = 0; + const std::shared_ptr& split) = 0; protected: explicit TableRead(const std::shared_ptr& memory_pool); diff --git a/include/paimon/utils/range.h b/include/paimon/utils/range.h index 3f557804..9cf1dd70 100644 --- a/include/paimon/utils/range.h +++ b/include/paimon/utils/range.h @@ -17,6 +17,7 @@ #pragma once #include #include +#include #include "paimon/visibility.h" @@ -28,9 +29,23 @@ struct PAIMON_EXPORT Range { /// Returns the number of integers in the range [from, to]. int64_t Count() const; + /// Computes the intersection of two ranges. static std::optional Intersection(const Range& left, const Range& right); + + /// Checks whether two ranges have any overlap. static bool HasIntersection(const Range& left, const Range& right); + /// Sorts a list of ranges by `from`, then merges overlapping or adjacent ranges. + /// @param ranges Input vector of ranges to merge. + /// @param adjacent If true, also merges ranges that are adjacent (e.g., [1,3] and [4,5] → + /// [1,5]). + /// If false, only merges strictly overlapping ranges. + /// @return A new vector of non-overlapping, sorted ranges. + static std::vector SortAndMergeOverlap(const std::vector& ranges, bool adjacent); + + /// Computes the set intersection of two collections of disjoint, sorted ranges. 
+ static std::vector And(const std::vector& left, const std::vector& right); + bool operator==(const Range& other) const; bool operator<(const Range& other) const; diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index 9fa42a98..f1e1e58f 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -46,6 +46,7 @@ set(PAIMON_COMMON_SRCS common/fs/file_system.cpp common/fs/resolving_file_system.cpp common/fs/file_system_factory.cpp + common/global_index/complete_index_score_batch_reader.cpp common/global_index/bitmap_topk_global_index_result.cpp common/global_index/bitmap_global_index_result.cpp common/global_index/global_index_result.cpp @@ -227,7 +228,8 @@ set(PAIMON_CORE_SRCS core/table/sink/commit_message_impl.cpp core/table/sink/commit_message_serializer.cpp core/table/source/append_only_table_read.cpp - core/table/source/data_split.cpp + core/table/source/split.cpp + core/table/source/data_split_impl.cpp core/table/source/data_table_batch_scan.cpp core/table/source/data_table_stream_scan.cpp core/table/source/fallback_table_read.cpp @@ -325,6 +327,7 @@ if(PAIMON_BUILD_TESTS) common/file_index/bsi/bit_slice_index_roaring_bitmap_test.cpp common/file_index/bloomfilter/bloom_filter_file_index_test.cpp common/file_index/bloomfilter/fast_hash_test.cpp + common/global_index/complete_index_score_batch_reader_test.cpp common/global_index/global_index_result_test.cpp common/global_index/global_indexer_factory_test.cpp common/global_index/bitmap_global_index_result_test.cpp @@ -445,6 +448,7 @@ if(PAIMON_BUILD_TESTS) core/io/file_index_evaluator_test.cpp core/io/single_file_writer_test.cpp core/io/rolling_blob_file_writer_test.cpp + core/global_index/indexed_split_test.cpp core/manifest/file_source_test.cpp core/manifest/file_kind_test.cpp core/manifest/manifest_entry_writer_test.cpp diff --git a/src/paimon/common/data/blob_utils.cpp b/src/paimon/common/data/blob_utils.cpp index e8a9eadf..50015b70 100644 --- 
a/src/paimon/common/data/blob_utils.cpp +++ b/src/paimon/common/data/blob_utils.cpp @@ -74,14 +74,10 @@ Result BlobUtils::SeparateBlobArray( } SeparatedStructArrays result; - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( - result.main_array, - arrow::StructArray::Make(remaining_arrays, remaining_fields, struct_array->null_bitmap(), - struct_array->null_count(), struct_array->offset())); - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( - result.blob_array, - arrow::StructArray::Make(blob_arrays, blob_fields, struct_array->null_bitmap(), - struct_array->null_count(), struct_array->offset())); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(result.main_array, + arrow::StructArray::Make(remaining_arrays, remaining_fields)); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(result.blob_array, + arrow::StructArray::Make(blob_arrays, blob_fields)); return result; } diff --git a/src/paimon/common/file_index/bitmap/bitmap_file_index_test.cpp b/src/paimon/common/file_index/bitmap/bitmap_file_index_test.cpp index 5ec22393..ab2b5612 100644 --- a/src/paimon/common/file_index/bitmap/bitmap_file_index_test.cpp +++ b/src/paimon/common/file_index/bitmap/bitmap_file_index_test.cpp @@ -644,10 +644,10 @@ TEST_F(BitmapIndexTest, TestHighCardinalityForCompatibility) { }; // test v1 version - check_result(paimon::test::GetDataDir() + "/fileindex/bitmap-index-v1"); + check_result(paimon::test::GetDataDir() + "/file_index/bitmap-index-v1"); // test v2 version - check_result(paimon::test::GetDataDir() + "/fileindex/bitmap-index-v2"); + check_result(paimon::test::GetDataDir() + "/file_index/bitmap-index-v2"); } TEST_F(BitmapIndexTest, TestHighCardinalityForWriteAndRead) { diff --git a/src/paimon/common/global_index/complete_index_score_batch_reader.cpp b/src/paimon/common/global_index/complete_index_score_batch_reader.cpp new file mode 100644 index 00000000..d622438f --- /dev/null +++ b/src/paimon/common/global_index/complete_index_score_batch_reader.cpp @@ -0,0 +1,104 @@ +/* + * Copyright 2025-present Alibaba Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/global_index/complete_index_score_batch_reader.h" + +#include + +#include "arrow/api.h" +#include "arrow/array/array_base.h" +#include "arrow/array/array_nested.h" +#include "arrow/array/util.h" +#include "arrow/c/abi.h" +#include "arrow/c/bridge.h" +#include "arrow/scalar.h" +#include "paimon/common/reader/reader_utils.h" +#include "paimon/common/table/special_fields.h" +#include "paimon/common/types/row_kind.h" +#include "paimon/common/utils/arrow/mem_utils.h" +#include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/status.h" +namespace paimon { +CompleteIndexScoreBatchReader::CompleteIndexScoreBatchReader( + std::unique_ptr&& reader, const std::vector& scores, + const std::shared_ptr& pool) + : arrow_pool_(GetArrowPool(pool)), reader_(std::move(reader)), scores_(scores) {} + +Result CompleteIndexScoreBatchReader::NextBatch() { + PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatchWithBitmap batch_with_bitmap, + NextBatchWithBitmap()); + return ReaderUtils::ApplyBitmapToReadBatch(std::move(batch_with_bitmap), arrow_pool_.get()); +} + +void CompleteIndexScoreBatchReader::UpdateScoreFieldIndex(const arrow::StructType* struct_type) { + if (index_score_field_idx_ != -1) { + return; + } + index_score_field_idx_ = struct_type->GetFieldIndex(SpecialFields::IndexScore().Name()); + field_names_with_score_.reserve(struct_type->num_fields()); + for (const auto& field : 
struct_type->fields()) { + field_names_with_score_.push_back(field->name()); + } +} +Result CompleteIndexScoreBatchReader::NextBatchWithBitmap() { + PAIMON_ASSIGN_OR_RAISE(BatchReader::ReadBatchWithBitmap batch_with_bitmap, + reader_->NextBatchWithBitmap()); + if (BatchReader::IsEofBatch(batch_with_bitmap)) { + return batch_with_bitmap; + } + if (scores_.empty()) { + // Indicates score field all null. + return batch_with_bitmap; + } + + auto& [batch, bitmap] = batch_with_bitmap; + auto& [c_array, c_schema] = batch; + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_array, + arrow::ImportArray(c_array.get(), c_schema.get())); + auto struct_array = std::dynamic_pointer_cast(arrow_array); + if (!struct_array) { + return Status::Invalid("cannot cast array to StructArray in CompleteIndexScoreBatchReader"); + } + auto struct_type = struct_array->struct_type(); + UpdateScoreFieldIndex(struct_type); + + // prepare index score array + std::unique_ptr index_score_builder; + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::MakeBuilder( + arrow_pool_.get(), SpecialFields::IndexScore().Type(), &index_score_builder)); + auto typed_builder = dynamic_cast(index_score_builder.get()); + assert(typed_builder); + PAIMON_RETURN_NOT_OK_FROM_ARROW(typed_builder->Reserve(struct_array->length())); + bool all_not_null = (struct_array->length() == bitmap.Cardinality()); + for (int64_t i = 0; i < struct_array->length(); i++) { + if (all_not_null || bitmap.Contains(i)) { + PAIMON_RETURN_NOT_OK_FROM_ARROW(typed_builder->Append(scores_[score_cursor_++])); + } else { + PAIMON_RETURN_NOT_OK_FROM_ARROW(typed_builder->AppendNull()); + } + } + std::shared_ptr index_score_array; + PAIMON_RETURN_NOT_OK_FROM_ARROW(typed_builder->Finish(&index_score_array)); + // update index score array to struct array + arrow::ArrayVector array_vec = struct_array->fields(); + array_vec[index_score_field_idx_] = index_score_array; + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr array_with_score, + 
arrow::StructArray::Make(array_vec, field_names_with_score_)); + PAIMON_RETURN_NOT_OK_FROM_ARROW( + arrow::ExportArray(*array_with_score, c_array.get(), c_schema.get())); + return batch_with_bitmap; +} +} // namespace paimon diff --git a/src/paimon/common/global_index/complete_index_score_batch_reader.h b/src/paimon/common/global_index/complete_index_score_batch_reader.h new file mode 100644 index 00000000..72743067 --- /dev/null +++ b/src/paimon/common/global_index/complete_index_score_batch_reader.h @@ -0,0 +1,67 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/array/array_base.h" +#include "paimon/reader/batch_reader.h" +#include "paimon/result.h" + +namespace paimon { +class MemoryPool; +class Metrics; +/// A batch reader that enriches the output Arrow array with index score information. +/// It assumes the input data already contains the `_INDEX_SCORE` column, +/// and ensures this score is properly updated in the returned batches. +/// +/// @pre The read schema must include the `_INDEX_SCORE` field. 
+class CompleteIndexScoreBatchReader : public BatchReader { + public: + CompleteIndexScoreBatchReader(std::unique_ptr&& reader, + const std::vector& scores, + const std::shared_ptr& pool); + + Result NextBatch() override; + + Result NextBatchWithBitmap() override; + + void Close() override { + reader_->Close(); + } + + std::shared_ptr GetReaderMetrics() const override { + return reader_->GetReaderMetrics(); + } + + private: + void UpdateScoreFieldIndex(const arrow::StructType* struct_type); + + private: + size_t score_cursor_ = 0; + int32_t index_score_field_idx_ = -1; + std::vector field_names_with_score_; + std::unique_ptr arrow_pool_; + std::unique_ptr reader_; + std::vector scores_; +}; +} // namespace paimon diff --git a/src/paimon/common/global_index/complete_index_score_batch_reader_test.cpp b/src/paimon/common/global_index/complete_index_score_batch_reader_test.cpp new file mode 100644 index 00000000..63cda73b --- /dev/null +++ b/src/paimon/common/global_index/complete_index_score_batch_reader_test.cpp @@ -0,0 +1,162 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/common/global_index/complete_index_score_batch_reader.h" + +#include "arrow/api.h" +#include "arrow/array/array_base.h" +#include "arrow/c/abi.h" +#include "arrow/c/bridge.h" +#include "arrow/ipc/json_simple.h" +#include "gtest/gtest.h" +#include "paimon/common/table/special_fields.h" +#include "paimon/common/types/data_field.h" +#include "paimon/format/file_format.h" +#include "paimon/format/file_format_factory.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/status.h" +#include "paimon/testing/mock/mock_file_batch_reader.h" +#include "paimon/testing/utils/read_result_collector.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { +class CompleteIndexScoreBatchReaderTest : public ::testing::Test { + public: + void SetUp() override { + pool_ = GetDefaultPool(); + } + void TearDown() override { + pool_.reset(); + } + + std::unique_ptr PrepareCompleteIndexScoreBatchReader( + const std::shared_ptr& src_array, const RoaringBitmap32& seleced_bitmap, + const std::vector& scores, int32_t batch_size) const { + auto file_batch_reader = std::make_unique(src_array, src_array->type(), + seleced_bitmap, batch_size); + return std::make_unique(std::move(file_batch_reader), scores, + pool_); + } + + std::unique_ptr PrepareCompleteIndexScoreBatchReader( + const std::shared_ptr& src_array, const std::vector& scores, + int32_t batch_size) const { + auto file_batch_reader = + std::make_unique(src_array, src_array->type(), batch_size); + return std::make_unique(std::move(file_batch_reader), scores, + pool_); + } + + private: + std::shared_ptr pool_; +}; + +TEST_F(CompleteIndexScoreBatchReaderTest, TestSimple) { + arrow::FieldVector fields = { + arrow::field("f0", arrow::utf8()), + arrow::field("f1", arrow::int32()), + arrow::field("_INDEX_SCORE", arrow::float32()), + arrow::field("_ROW_ID", arrow::int64()), + }; + + auto src_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({fields}), R"([ + ["Alice", 10, null, 
0], + ["Bob", 11, null, 1], + ["Cathy", 12, null, 2] + ])") + .ValueOrDie(); + + std::vector scores = {1.23f, 2.34f, 100.10f}; + auto reader = PrepareCompleteIndexScoreBatchReader(src_array, scores, + /*batch_size=*/1); + + ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult(reader.get())); + + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow::struct_(fields), {R"([ + ["Alice", 10, 1.23, 0], + ["Bob", 11, 2.34, 1], + ["Cathy", 12, 100.10, 2] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + ASSERT_TRUE(expected_array->ApproxEquals(*result_array)); + reader->Close(); +} + +TEST_F(CompleteIndexScoreBatchReaderTest, TestWithBitmap) { + arrow::FieldVector fields = { + arrow::field("f0", arrow::utf8()), + arrow::field("f1", arrow::int32()), + arrow::field("_INDEX_SCORE", arrow::float32()), + arrow::field("_ROW_ID", arrow::int64()), + }; + + auto src_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({fields}), R"([ + ["Alice", 10, null, 0], + ["Bob", 11, null, 1], + ["Cathy", 12, null, 2], + ["Davlid", 13, null, 4] + ])") + .ValueOrDie(); + + std::vector scores = {1.23f, -19.12f}; + auto selected_bitmap = RoaringBitmap32::From({0, 3}); + auto reader = PrepareCompleteIndexScoreBatchReader(src_array, selected_bitmap, scores, + /*batch_size=*/2); + + ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult(reader.get())); + + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow::struct_(fields), {R"([ + ["Alice", 10, 1.23, 0], + ["Davlid", 13, -19.12, 4] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + ASSERT_TRUE(expected_array->ApproxEquals(*result_array)); + reader->Close(); +} + +TEST_F(CompleteIndexScoreBatchReaderTest, TestReadWithNullScores) { + arrow::FieldVector fields = { + arrow::field("f0", arrow::utf8()), + arrow::field("f1", arrow::int32()), + 
arrow::field("_INDEX_SCORE", arrow::float32()), + arrow::field("_ROW_ID", arrow::int64()), + }; + + auto src_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_({fields}), R"([ + ["Alice", 10, null, 0], + ["Bob", 11, null, 1], + ["Cathy", 12, null, 2] + ])") + .ValueOrDie(); + + // scores is empty, indicates all null score + auto reader = PrepareCompleteIndexScoreBatchReader(src_array, /*scores=*/{}, + /*batch_size=*/1); + + ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult(reader.get())); + + auto expected_array = std::make_shared(src_array); + ASSERT_TRUE(expected_array->Equals(*result_array)); + reader->Close(); +} + +} // namespace paimon::test diff --git a/src/paimon/common/io/data_input_stream.cpp b/src/paimon/common/io/data_input_stream.cpp index e1bd2676..32df0254 100644 --- a/src/paimon/common/io/data_input_stream.cpp +++ b/src/paimon/common/io/data_input_stream.cpp @@ -121,4 +121,5 @@ template Result DataInputStream::ReadValue() const; template Result DataInputStream::ReadValue() const; template Result DataInputStream::ReadValue() const; template Result DataInputStream::ReadValue() const; +template Result DataInputStream::ReadValue() const; } // namespace paimon diff --git a/src/paimon/common/table/special_fields.h b/src/paimon/common/table/special_fields.h index 00d397ad..aec06b11 100644 --- a/src/paimon/common/table/special_fields.h +++ b/src/paimon/common/table/special_fields.h @@ -26,6 +26,10 @@ namespace paimon { struct SpecialFields { + static constexpr char KEY_FIELD_PREFIX[] = "_KEY_"; + static constexpr int32_t KEY_VALUE_SPECIAL_FIELD_COUNT = 2; + static constexpr int32_t CPP_FIELD_ID_END = std::numeric_limits::max() - 10000; + static const DataField& SequenceNumber() { static const DataField data_field = DataField(std::numeric_limits::max() - 1, @@ -45,12 +49,15 @@ struct SpecialFields { return data_field; } - static constexpr char KEY_FIELD_PREFIX[] = "_KEY_"; - static constexpr int32_t 
KEY_VALUE_SPECIAL_FIELD_COUNT = 2; + static const DataField& IndexScore() { + static const DataField data_field = + DataField(CPP_FIELD_ID_END - 1, arrow::field("_INDEX_SCORE", arrow::float32())); + return data_field; + } static bool IsSpecialFieldName(const std::string& field_name) { if (field_name == SequenceNumber().Name() || field_name == ValueKind().Name() || - field_name == RowId().Name()) { + field_name == RowId().Name() || field_name == IndexScore().Name()) { return true; } return false; diff --git a/src/paimon/common/table/special_fields_test.cpp b/src/paimon/common/table/special_fields_test.cpp index 93f95854..da2fdaec 100644 --- a/src/paimon/common/table/special_fields_test.cpp +++ b/src/paimon/common/table/special_fields_test.cpp @@ -23,33 +23,40 @@ namespace paimon::test { -TEST(SpecialFieldsTest, SequenceNumberField) { +TEST(SpecialFieldsTest, TestSequenceNumberField) { ASSERT_EQ(SpecialFields::SequenceNumber().Id(), std::numeric_limits::max() - 1); ASSERT_EQ(SpecialFields::SequenceNumber().Name(), "_SEQUENCE_NUMBER"); ASSERT_EQ(SpecialFields::SequenceNumber().Type()->id(), arrow::Type::INT64); } -TEST(SpecialFieldsTest, ValueKindField) { +TEST(SpecialFieldsTest, TestValueKindField) { ASSERT_EQ(SpecialFields::ValueKind().Id(), std::numeric_limits::max() - 2); ASSERT_EQ(SpecialFields::ValueKind().Name(), "_VALUE_KIND"); ASSERT_EQ(SpecialFields::ValueKind().Type()->id(), arrow::Type::INT8); } -TEST(SpecialFieldsTest, RowIdField) { +TEST(SpecialFieldsTest, TestRowIdField) { ASSERT_EQ(SpecialFields::RowId().Id(), std::numeric_limits::max() - 5); ASSERT_EQ(SpecialFields::RowId().Name(), "_ROW_ID"); ASSERT_EQ(SpecialFields::RowId().Type()->id(), arrow::Type::INT64); } -TEST(SpecialFieldsTest, KeyValueSpecialFieldCount) { +TEST(SpecialFieldsTest, TestIndexScore) { + ASSERT_EQ(SpecialFields::IndexScore().Id(), std::numeric_limits::max() - 10000 - 1); + ASSERT_EQ(SpecialFields::IndexScore().Name(), "_INDEX_SCORE"); + 
ASSERT_EQ(SpecialFields::IndexScore().Type()->id(), arrow::Type::FLOAT); +} + +TEST(SpecialFieldsTest, TestKeyValueSpecialFieldCount) { ASSERT_EQ(SpecialFields::KEY_VALUE_SPECIAL_FIELD_COUNT, 2); } -TEST(SpecialFieldsTest, IsSpecialFieldName) { +TEST(SpecialFieldsTest, TestIsSpecialFieldName) { ASSERT_TRUE(SpecialFields::IsSpecialFieldName("_SEQUENCE_NUMBER")); ASSERT_TRUE(SpecialFields::IsSpecialFieldName("_VALUE_KIND")); ASSERT_FALSE(SpecialFields::IsSpecialFieldName("VALUE_KIND")); ASSERT_TRUE(SpecialFields::IsSpecialFieldName("_ROW_ID")); + ASSERT_TRUE(SpecialFields::IsSpecialFieldName("_INDEX_SCORE")); } } // namespace paimon::test diff --git a/src/paimon/common/utils/math.h b/src/paimon/common/utils/math.h index b3f93b0a..2430947d 100644 --- a/src/paimon/common/utils/math.h +++ b/src/paimon/common/utils/math.h @@ -27,6 +27,7 @@ #include #include #include +#include #include namespace paimon { @@ -35,31 +36,53 @@ namespace paimon { // encode/decode big endian. template inline T EndianSwapValue(T v) { - static_assert(std::is_integral_v, "non-integral type"); + static_assert(std::is_standard_layout_v && std::is_trivially_copyable_v, + "Type must be standard-layout and trivially copyable (e.g., integral or " + "floating-point types)."); + if constexpr (sizeof(T) == 1) { + return v; + } else if constexpr (std::is_same_v || std::is_same_v || + std::is_integral_v) { + using UintType = std::conditional_t< + sizeof(T) == 2, uint16_t, + std::conditional_t > >; + + static_assert(!std::is_same_v, + "Unsupported size: only 4-byte and 8-byte types are supported."); + + UintType int_repr; + std::memcpy(&int_repr, &v, sizeof(T)); #ifdef _MSC_VER - if (sizeof(T) == 2) { - return static_cast(_byteswap_ushort(static_cast(v))); - } else if (sizeof(T) == 4) { - return static_cast(_byteswap_ulong(static_cast(v))); - } else if (sizeof(T) == 8) { - return static_cast(_byteswap_uint64(static_cast(v))); - } + if constexpr (sizeof(T) == 2) { + int_repr = 
_byteswap_ushort(static_cast(int_repr)); + } else if constexpr (sizeof(T) == 4) { + int_repr = _byteswap_ulong(static_cast(int_repr)); + } else if constexpr (sizeof(T) == 8) { + int_repr = _byteswap_uint64(static_cast(int_repr)); + } #else - if (sizeof(T) == 2) { - return static_cast(__builtin_bswap16(static_cast(v))); - } else if (sizeof(T) == 4) { - return static_cast(__builtin_bswap32(static_cast(v))); - } else if (sizeof(T) == 8) { - return static_cast(__builtin_bswap64(static_cast(v))); - } + if constexpr (sizeof(T) == 2) { + int_repr = __builtin_bswap16(static_cast(int_repr)); + } else if constexpr (sizeof(T) == 4) { + int_repr = __builtin_bswap32(static_cast(int_repr)); + } else if constexpr (sizeof(T) == 8) { + int_repr = __builtin_bswap64(static_cast(int_repr)); + } #endif - // Recognized by clang as bswap, but not by gcc :( - T ret_val = 0; - for (std::size_t i = 0; i < sizeof(T); ++i) { - ret_val |= ((v >> (8 * i)) & 0xff) << (8 * (sizeof(T) - 1 - i)); + T result; + std::memcpy(&result, &int_repr, sizeof(T)); + return result; + } else { + // Fallback for unsupported sizes (e.g., 16-bit integers on some platforms) + T ret_val{}; + for (std::size_t i = 0; i < sizeof(T); ++i) { + reinterpret_cast(&ret_val)[sizeof(T) - 1 - i] = + reinterpret_cast(&v)[i]; + } + return ret_val; } - return ret_val; } } // namespace paimon diff --git a/src/paimon/common/utils/range.cpp b/src/paimon/common/utils/range.cpp index 3cf58def..abecbce6 100644 --- a/src/paimon/common/utils/range.cpp +++ b/src/paimon/common/utils/range.cpp @@ -15,6 +15,7 @@ */ #include "paimon/utils/range.h" +#include #include #include "fmt/format.h" @@ -27,6 +28,64 @@ int64_t Range::Count() const { return to - from + 1; } +std::vector Range::SortAndMergeOverlap(const std::vector& ranges, bool adjacent) { + if (ranges.empty() || ranges.size() == 1) { + return ranges; + } + // sort + std::vector sorted_ranges = ranges; + std::sort(sorted_ranges.begin(), sorted_ranges.end(), + [](const Range& left, const 
Range& right) { return left.from < right.from; }); + + std::vector results; + Range current = sorted_ranges[0]; + + for (size_t i = 1; i < sorted_ranges.size(); ++i) { + Range next = sorted_ranges[i]; + // Check if current and next overlap (not just adjacent) + if (current.to + (adjacent ? 1 : 0) >= next.from) { + // Merge: extend current range + current = Range(current.from, std::max(current.to, next.to)); + } else { + // No overlap: add current to result and move to next + results.push_back(current); + current = next; + } + } + // Add the last range + results.push_back(current); + return results; +} + +std::vector Range::And(const std::vector& left, const std::vector& right) { + if (left.empty() || right.empty()) { + return {}; + } + std::vector results; + size_t i = 0; + size_t j = 0; + + while (i < left.size() && j < right.size()) { + const Range& lhs = left[i]; + const Range& rhs = right[j]; + + // Compute intersection of current ranges + std::optional intersect = Range::Intersection(lhs, rhs); + if (intersect) { + results.push_back(intersect.value()); + } + + // Advance the pointer of the range that ends earlier + if (lhs.to <= rhs.to) { + i++; + } else { + j++; + } + } + + return results; +} + std::optional Range::Intersection(const Range& left, const Range& right) { int64_t start = std::max(left.from, right.from); int64_t end = std::min(left.to, right.to); diff --git a/src/paimon/common/utils/range_test.cpp b/src/paimon/common/utils/range_test.cpp index ae2cd008..4ce16af9 100644 --- a/src/paimon/common/utils/range_test.cpp +++ b/src/paimon/common/utils/range_test.cpp @@ -92,4 +92,165 @@ TEST(RangeTest, TestCompare) { ASSERT_TRUE(r1 < r3); ASSERT_TRUE(r3 < r2); } + +TEST(RangeTest, TestSortAndMergeOverlap) { + { + // test simple + std::vector ranges = {Range(0, 10), Range(5, 15)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 15)}; + ASSERT_EQ(result, expected); + } + { + // test no overlap with 
adjacent = true + std::vector ranges = {Range(0, 10), Range(11, 20)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/true); + std::vector expected = {Range(0, 20)}; + ASSERT_EQ(result, expected); + } + { + // test no overlap with adjacent = false + std::vector ranges = {Range(0, 10), Range(11, 20)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 10), Range(11, 20)}; + ASSERT_EQ(result, expected); + } + { + // test overlap multiple + std::vector ranges = {Range(0, 10), Range(5, 15), Range(12, 20)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 20)}; + ASSERT_EQ(result, expected); + } + { + // test overlap mixed + std::vector ranges = {Range(0, 10), Range(5, 15), Range(20, 30), Range(25, 35)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 15), Range(20, 35)}; + ASSERT_EQ(result, expected); + } + { + // test overlap unsorted + std::vector ranges = {Range(20, 30), Range(0, 10), Range(5, 15)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 15), Range(20, 30)}; + ASSERT_EQ(result, expected); + } + { + // test overlap contained + std::vector ranges = {Range(0, 20), Range(5, 10)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 20)}; + ASSERT_EQ(result, expected); + } + { + // test single + std::vector ranges = {Range(0, 10)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 10)}; + ASSERT_EQ(result, expected); + } + { + // test identical + std::vector ranges = {Range(0, 10), Range(0, 10)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 10)}; + ASSERT_EQ(result, expected); + } + { + // test overlap touching exactly + std::vector ranges = 
{Range(0, 10), Range(10, 20)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 20)}; + ASSERT_EQ(result, expected); + } + { + // test overlap complex + std::vector ranges = {Range(0, 5), Range(3, 8), Range(10, 15), + Range(20, 25), Range(22, 28), Range(30, 35)}; + auto result = Range::SortAndMergeOverlap(ranges, /*adjacent=*/false); + std::vector expected = {Range(0, 8), Range(10, 15), Range(20, 28), Range(30, 35)}; + ASSERT_EQ(result, expected); + } +} + +TEST(RangeTest, TestAnd) { + { + // test and basic + std::vector left = {Range(0, 10), Range(20, 30)}; + std::vector right = {Range(5, 15), Range(25, 35)}; + auto result = Range::And(left, right); + std::vector expected = {Range(5, 10), Range(25, 30)}; + ASSERT_EQ(result, expected); + } + { + // test no intersection + std::vector left = {Range(0, 10)}; + std::vector right = {Range(20, 30)}; + auto result = Range::And(left, right); + ASSERT_TRUE(result.empty()); + } + { + // test and same ranges + std::vector left = {Range(0, 10)}; + std::vector right = {Range(0, 10)}; + auto result = Range::And(left, right); + std::vector expected = {Range(0, 10)}; + ASSERT_EQ(result, expected); + } + { + // test and partial overlap + std::vector left = {Range(0, 10)}; + std::vector right = {Range(5, 15)}; + auto result = Range::And(left, right); + std::vector expected = {Range(5, 10)}; + ASSERT_EQ(result, expected); + } + { + // test and contained + std::vector left = {Range(0, 20)}; + std::vector right = {Range(5, 10)}; + auto result = Range::And(left, right); + std::vector expected = {Range(5, 10)}; + ASSERT_EQ(result, expected); + } + { + // test and multiple ranges + std::vector left = {Range(0, 10), Range(20, 30), Range(40, 50)}; + std::vector right = {Range(5, 25), Range(35, 45)}; + auto result = Range::And(left, right); + std::vector expected = {Range(5, 10), Range(20, 25), Range(40, 45)}; + ASSERT_EQ(result, expected); + } + { + // test and empty left + 
std::vector left = {}; + std::vector right = {Range(0, 10)}; + auto result = Range::And(left, right); + ASSERT_TRUE(result.empty()); + } + { + // test and empty right + std::vector left = {Range(0, 10)}; + std::vector right = {}; + auto result = Range::And(left, right); + ASSERT_TRUE(result.empty()); + } + { + // test and touching at boundary + std::vector left = {Range(0, 10)}; + std::vector right = {Range(10, 20)}; + auto result = Range::And(left, right); + std::vector expected = {Range(10, 10)}; + ASSERT_EQ(result, expected); + } + { + // test and complex + std::vector left = {Range(0, 5), Range(10, 15), Range(20, 25), Range(30, 35)}; + std::vector right = {Range(3, 12), Range(18, 28), Range(32, 40)}; + auto result = Range::And(left, right); + std::vector expected = {Range(3, 5), Range(10, 12), Range(20, 25), Range(32, 35)}; + ASSERT_EQ(result, expected); + } +} } // namespace paimon::test diff --git a/src/paimon/core/global_index/global_index_scan_impl.cpp b/src/paimon/core/global_index/global_index_scan_impl.cpp index 2f04c104..b447aeac 100644 --- a/src/paimon/core/global_index/global_index_scan_impl.cpp +++ b/src/paimon/core/global_index/global_index_scan_impl.cpp @@ -61,16 +61,36 @@ Result> GlobalIndexScanImpl::CreateR return std::make_shared(table_schema_, index_file_path_factory, filtered_entries, options_, pool_); } -Result> GlobalIndexScanImpl::GetRowRangeList() { +Result> GlobalIndexScanImpl::GetRowRangeList() { PAIMON_RETURN_NOT_OK(Scan()); - std::set ranges; + std::map> index_ranges; for (const auto& entry : entries_) { const auto& global_index_meta = entry.index_file->GetGlobalIndexMeta(); assert(global_index_meta); - ranges.insert(Range(global_index_meta.value().row_range_start, - global_index_meta.value().row_range_end)); + const auto& index_meta = global_index_meta.value(); + index_ranges[entry.index_file->IndexType()].push_back( + Range(index_meta.row_range_start, index_meta.row_range_end)); } - return ranges; + std::string check_index_type; + 
std::vector check_ranges; + // check all type index have same shard ranges + // If index a has [1,10],[20,30] and index b has [1,10],[20,25], it's inconsistent, because + // it is hard to handle the [26,30] range. + for (const auto& [type, ranges] : index_ranges) { + if (check_index_type.empty()) { + check_index_type = type; + check_ranges = Range::SortAndMergeOverlap(ranges, /*adjacent=*/true); + } else { + auto merged = Range::SortAndMergeOverlap(ranges, /*adjacent=*/true); + if (merged != check_ranges) { + return Status::Invalid( + fmt::format("Inconsistent row ranges among index types: {} and {}", + check_index_type, type)); + } + } + } + + return check_ranges; } Status GlobalIndexScanImpl::Scan() { diff --git a/src/paimon/core/global_index/global_index_scan_impl.h b/src/paimon/core/global_index/global_index_scan_impl.h index 6f2f7e71..e4b8b2d6 100644 --- a/src/paimon/core/global_index/global_index_scan_impl.h +++ b/src/paimon/core/global_index/global_index_scan_impl.h @@ -34,7 +34,7 @@ class GlobalIndexScanImpl : public GlobalIndexScan { Result> CreateRangeScan( const Range& range) override; - Result> GetRowRangeList() override; + Result> GetRowRangeList() override; private: Status Scan(); diff --git a/src/paimon/core/global_index/indexed_split_impl.h b/src/paimon/core/global_index/indexed_split_impl.h new file mode 100644 index 00000000..9c1bf9ad --- /dev/null +++ b/src/paimon/core/global_index/indexed_split_impl.h @@ -0,0 +1,114 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "fmt/format.h" +#include "fmt/ranges.h" +#include "paimon/core/table/source/data_split_impl.h" +#include "paimon/global_index/indexed_split.h" + +namespace paimon { +class IndexedSplitImpl : public IndexedSplit { + public: + static constexpr int64_t MAGIC = -938472394838495695L; + static constexpr int32_t VERSION = 1; + + IndexedSplitImpl(const std::shared_ptr& data_split, + const std::vector& row_ranges, const std::vector& scores) + : data_split_(data_split), row_ranges_(row_ranges), scores_(scores) {} + IndexedSplitImpl(const std::shared_ptr& data_split, + const std::vector& row_ranges) + : IndexedSplitImpl(data_split, row_ranges, {}) {} + + std::shared_ptr GetDataSplit() const override { + return data_split_; + } + const std::vector& RowRanges() const override { + return row_ranges_; + } + const std::vector& Scores() const override { + return scores_; + } + + bool operator==(const IndexedSplitImpl& other) const { + if (this == &other) { + return true; + } + bool score_equal = + (scores_.size() == other.scores_.size()) && + std::equal(scores_.begin(), scores_.end(), other.scores_.begin(), + [](float left, float right) { return std::abs(left - right) <= kEpsilon; }); + return score_equal && *data_split_ == *(other.data_split_) && + row_ranges_ == other.row_ranges_; + } + + bool TEST_Equal(const IndexedSplitImpl& other) const { + if (this == &other) { + return true; + } + bool score_equal = + (scores_.size() == other.scores_.size()) && + std::equal(scores_.begin(), scores_.end(), other.scores_.begin(), + [](float left, float right) { return std::abs(left - right) <= kEpsilon; }); + + return score_equal && data_split_->TEST_Equal(*other.data_split_) && + row_ranges_ == other.row_ranges_; + } + + std::string ToString() const { + std::vector row_ranges_str_vec; + 
row_ranges_str_vec.reserve(row_ranges_.size()); + for (const auto& range : row_ranges_) { + row_ranges_str_vec.push_back(range.ToString()); + } + std::string row_ranges_str = fmt::format("[{}]", fmt::join(row_ranges_str_vec, ",")); + std::string scores_str = fmt::format("[{}]", fmt::join(scores_, ",")); + return fmt::format("IndexedSplit{{split={}, rowRanges={}, scores={}}}", + data_split_->ToString(), row_ranges_str, scores_str); + } + + Status Validate() const { + if (row_ranges_.empty()) { + return Status::Invalid("IndexedSplit must have non-empty row ranges"); + } + if (!scores_.empty()) { + size_t row_count = 0; + for (const auto& range : row_ranges_) { + row_count += range.Count(); + } + if (row_count != scores_.size()) { + return Status::Invalid("Scores length does not match row ranges in indexed split."); + } + } + return Status::OK(); + } + + private: + static constexpr float kEpsilon = 1e-5f; + + std::shared_ptr data_split_; + std::vector row_ranges_; + std::vector scores_; +}; +} // namespace paimon diff --git a/src/paimon/core/global_index/indexed_split_test.cpp b/src/paimon/core/global_index/indexed_split_test.cpp new file mode 100644 index 00000000..f048333c --- /dev/null +++ b/src/paimon/core/global_index/indexed_split_test.cpp @@ -0,0 +1,159 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/common/data/data_define.h" +#include "paimon/core/global_index/indexed_split_impl.h" +#include "paimon/core/table/source/data_split_impl.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/result.h" +#include "paimon/testing/utils/binary_row_generator.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { +TEST(IndexedSplitTest, TestSimple) { + std::string file_name = paimon::test::GetDataDir() + "/global_index/indexed_split-01"; + auto file_system = std::make_unique(); + + ASSERT_OK_AND_ASSIGN(auto input_stream, file_system->Open(file_name)); + std::vector split_bytes(input_stream->Length().value_or(0), 0); + + ASSERT_OK(input_stream->Read(split_bytes.data(), split_bytes.size())); + ASSERT_OK(input_stream->Close()); + + auto pool = GetDefaultPool(); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + + auto result_indexed_split = std::dynamic_pointer_cast(result); + + auto meta1 = std::make_shared( + "file1.orc", 100l, 200l, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), 50l, 249l, 0, 0, + std::vector>(), Timestamp(1765535214349l, 0), 0, nullptr, + FileSource::Append(), std::nullopt, std::nullopt, 50l, std::nullopt); + auto meta2 = std::make_shared( + "file2.orc", 101l, 100l, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), 250l, 349l, 0, 0, + std::vector>(), Timestamp(1765535214349l, 0), 0, nullptr, + FileSource::Append(), std::nullopt, std::nullopt, 250l, std::nullopt); + auto meta3 = std::make_shared( + "file3.orc", 102l, 200l, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), 1000l, 1199l, 0, 0, + 
std::vector>(), Timestamp(1765535214349, 0), 0, nullptr, + FileSource::Append(), std::nullopt, std::nullopt, 1000l, std::nullopt); + + DataSplitImpl::Builder builder( + /*partition=*/BinaryRow::EmptyRow(), + /*bucket=*/0, /*bucket_path=*/ + "data/test_table/bucket-0", + std::vector>({meta1, meta2, meta3})); + + auto expected_data_split = std::dynamic_pointer_cast( + builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build().value()); + + std::vector ranges = {Range(55, 56), Range(270, 270), Range(1001, 1002)}; + auto expected_indexed_split = std::make_shared(expected_data_split, ranges); + + ASSERT_EQ(*result_indexed_split, *expected_indexed_split) << result_indexed_split->ToString(); + ASSERT_OK_AND_ASSIGN(std::string serialize_bytes, Split::Serialize(result_indexed_split, pool)); + ASSERT_EQ(serialize_bytes, std::string((char*)split_bytes.data(), split_bytes.size())); +} + +TEST(IndexedSplitTest, TestIndexedSplitWithScore) { + std::string file_name = paimon::test::GetDataDir() + "/global_index/indexed_split-02"; + auto file_system = std::make_unique(); + + ASSERT_OK_AND_ASSIGN(auto input_stream, file_system->Open(file_name)); + std::vector split_bytes(input_stream->Length().value_or(0), 0); + + ASSERT_OK(input_stream->Read(split_bytes.data(), split_bytes.size())); + ASSERT_OK(input_stream->Close()); + + auto pool = GetDefaultPool(); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + + auto result_indexed_split = std::dynamic_pointer_cast(result); + + auto meta1 = std::make_shared( + "file1.orc", 100l, 200l, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), 50l, 249l, 0, 0, + std::vector>(), Timestamp(1765549435648l, 0), 0, nullptr, + FileSource::Append(), std::nullopt, std::nullopt, 50l, std::nullopt); + auto meta2 = std::make_shared( + "file2.orc", 101l, 100l, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), + 
SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), 250l, 349l, 0, 0, + std::vector>(), Timestamp(1765549435649l, 0), 0, nullptr, + FileSource::Append(), std::nullopt, std::nullopt, 250l, std::nullopt); + auto meta3 = std::make_shared( + "file3.orc", 102l, 200l, BinaryRow::EmptyRow(), BinaryRow::EmptyRow(), + SimpleStats::EmptyStats(), SimpleStats::EmptyStats(), 1000l, 1199l, 0, 0, + std::vector>(), Timestamp(1765549435649l, 0), 0, nullptr, + FileSource::Append(), std::nullopt, std::nullopt, 1000l, std::nullopt); + + DataSplitImpl::Builder builder( + /*partition=*/BinaryRow::EmptyRow(), + /*bucket=*/0, /*bucket_path=*/ + "data/test_table/bucket-0", + std::vector>({meta1, meta2, meta3})); + + auto expected_data_split = std::dynamic_pointer_cast( + builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build().value()); + + std::vector ranges = {Range(55, 56), Range(270, 270), Range(1001, 1002)}; + std::vector scores = {1.01f, 2.10f, -1.32f, 4.23f, 50.74f}; + auto expected_indexed_split = + std::make_shared(expected_data_split, ranges, scores); + + ASSERT_EQ(*result_indexed_split, *expected_indexed_split) << result_indexed_split->ToString(); + ASSERT_OK_AND_ASSIGN(std::string serialize_bytes, Split::Serialize(result_indexed_split, pool)); + ASSERT_EQ(serialize_bytes, std::string((char*)split_bytes.data(), split_bytes.size())); +} + +TEST(IndexedSplitTest, TestValidate) { + { + std::vector row_ranges = {Range(10, 20), Range(30, 40)}; + IndexedSplitImpl split(/*data_split=*/nullptr, row_ranges); + ASSERT_OK(split.Validate()); + } + { + std::vector row_ranges = {Range(10, 12), Range(30, 31)}; + std::vector scores = {10.01f, 10.11f, 10.21f, -30.01f, -30.11f}; + IndexedSplitImpl split(/*data_split=*/nullptr, row_ranges, scores); + ASSERT_OK(split.Validate()); + } + { + IndexedSplitImpl split(/*data_split=*/nullptr, /*row_ranges=*/std::vector()); + ASSERT_NOK_WITH_MSG(split.Validate(), "IndexedSplit must have non-empty row ranges"); + } + { + std::vector 
row_ranges = {Range(10, 12), Range(30, 31)}; + std::vector scores = {10.01f, 10.11f, 10.21f, -30.01f}; + IndexedSplitImpl split(/*data_split=*/nullptr, row_ranges, scores); + ASSERT_NOK_WITH_MSG(split.Validate(), + "Scores length does not match row ranges in indexed split."); + } +} +} // namespace paimon::test diff --git a/src/paimon/core/global_index/row_range_global_index_writer.cpp b/src/paimon/core/global_index/row_range_global_index_writer.cpp index 8560b07f..9eaa29e9 100644 --- a/src/paimon/core/global_index/row_range_global_index_writer.cpp +++ b/src/paimon/core/global_index/row_range_global_index_writer.cpp @@ -70,23 +70,20 @@ Result> CreateGlobalIndexWriter( return indexer->CreateWriter(field.Name(), &c_arrow_schema, index_file_manager, pool); } -Result> CreateBatchReader(const std::string& table_path, - const std::string& field_name, - const std::shared_ptr& split, - const Range& range, - const CoreOptions& core_options, - const std::shared_ptr& pool) { +Result> CreateBatchReader( + const std::string& table_path, const std::string& field_name, + const std::shared_ptr& indexed_split, const CoreOptions& core_options, + const std::shared_ptr& pool) { ReadContextBuilder read_context_builder(table_path); read_context_builder.SetOptions(core_options.ToMap()) .EnablePrefetch(true) .WithMemoryPool(pool) - .SetReadSchema({field_name}) - .SetRowRanges({range}); + .SetReadSchema({field_name}); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_context, read_context_builder.Finish()); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr table_read, TableRead::Create(std::move(read_context))); - return table_read->CreateReader(split); + return table_read->CreateReader(indexed_split); } Result> BuildIndex(const std::string& field_name, @@ -109,10 +106,8 @@ Result> BuildIndex(const std::string& field_name, if (fields.empty()) { return Status::Invalid("array read from batch reader only contains row kind"); } - PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( - std::shared_ptr new_array, - 
arrow::StructArray::Make(fields, {field_name}, struct_array->null_bitmap(), - struct_array->null_count(), struct_array->offset())); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr new_array, + arrow::StructArray::Make(fields, {field_name})); ::ArrowArray c_new_array; PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*new_array, &c_new_array)); PAIMON_RETURN_NOT_OK(global_index_writer->AddBatch(&c_new_array)); @@ -147,13 +142,19 @@ Result> ToCommitMessage( } // namespace Result> RowRangeGlobalIndexWriter::WriteIndex( const std::string& table_path, const std::string& field_name, const std::string& index_type, - const std::shared_ptr& split, const Range& range, + const std::shared_ptr& indexed_split, const std::map& options, const std::shared_ptr& memory_pool) { - auto data_split = std::dynamic_pointer_cast(split); + auto data_split = std::dynamic_pointer_cast(indexed_split->GetDataSplit()); if (!data_split) { return Status::Invalid("split cannot be casted to data split"); } + const auto& ranges = indexed_split->RowRanges(); + if (ranges.size() != 1) { + return Status::Invalid( + "RowRangeGlobalIndexWriter only supports a single contiguous range."); + } + const auto& range = ranges[0]; std::shared_ptr pool = memory_pool ? 
memory_pool : GetDefaultPool(); // load schema @@ -186,7 +187,7 @@ Result> RowRangeGlobalIndexWriter::WriteIndex( // create batch reader PAIMON_ASSIGN_OR_RAISE( std::unique_ptr batch_reader, - CreateBatchReader(table_path, field_name, split, range, core_options, pool)); + CreateBatchReader(table_path, field_name, indexed_split, core_options, pool)); // read from data split and write to index writer PAIMON_ASSIGN_OR_RAISE(std::vector global_index_io_metas, diff --git a/src/paimon/core/operation/abstract_split_read.cpp b/src/paimon/core/operation/abstract_split_read.cpp index 1b843a8c..59453d12 100644 --- a/src/paimon/core/operation/abstract_split_read.cpp +++ b/src/paimon/core/operation/abstract_split_read.cpp @@ -65,6 +65,7 @@ Result>> AbstractSplitRead::CreateRawFi const BinaryRow& partition, const std::vector>& data_files, const std::shared_ptr& read_schema, const std::shared_ptr& predicate, const std::unordered_map& deletion_file_map, + const std::vector& row_ranges, const std::shared_ptr& data_file_path_factory) const { if (data_files.empty()) { return std::vector>(); @@ -83,7 +84,7 @@ Result>> AbstractSplitRead::CreateRawFi PAIMON_ASSIGN_OR_RAISE( std::unique_ptr file_reader, CreateFieldMappingReader(data_file_path, file, partition, reader_builder.get(), - field_mapping_builder.get(), deletion_file_map, + field_mapping_builder.get(), deletion_file_map, row_ranges, data_file_path_factory)); if (file_reader) { raw_file_readers.push_back(std::move(file_reader)); @@ -169,6 +170,7 @@ Result> AbstractSplitRead::CreateFieldMappingReader const BinaryRow& partition, const ReaderBuilder* reader_builder, const FieldMappingBuilder* field_mapping_builder, const std::unordered_map& deletion_file_map, + const std::vector& row_ranges, const std::shared_ptr& data_file_path_factory) const { std::shared_ptr data_schema; if (file_meta->schema_id == context_->GetTableSchema()->Id()) { @@ -209,7 +211,7 @@ Result> AbstractSplitRead::CreateFieldMappingReader 
PAIMON_ASSIGN_OR_RAISE(std::unique_ptr final_reader, ApplyIndexAndDvReaderIfNeeded( std::move(file_reader), file_meta, all_data_schema, read_schema, - predicate, deletion_file_map, data_file_path_factory)); + predicate, deletion_file_map, row_ranges, data_file_path_factory)); if (!final_reader) { // file is skipped by index or dv return std::unique_ptr(); diff --git a/src/paimon/core/operation/abstract_split_read.h b/src/paimon/core/operation/abstract_split_read.h index 57b0162d..df7a18d7 100644 --- a/src/paimon/core/operation/abstract_split_read.h +++ b/src/paimon/core/operation/abstract_split_read.h @@ -71,6 +71,7 @@ class AbstractSplitRead : public SplitRead { const std::shared_ptr& read_schema, const std::shared_ptr& predicate, const std::unordered_map& deletion_file_map, + const std::vector& row_ranges, const std::shared_ptr& data_file_path_factory) const; static std::unordered_map CreateDeletionFileMap( @@ -87,6 +88,7 @@ class AbstractSplitRead : public SplitRead { const std::shared_ptr& read_schema, const std::shared_ptr& predicate, const std::unordered_map& deletion_file_map, + const std::vector& row_ranges, const std::shared_ptr& data_file_path_factory) const = 0; // 1. 
project write cols to data schema @@ -110,6 +112,7 @@ class AbstractSplitRead : public SplitRead { const BinaryRow& partition, const ReaderBuilder* reader_builder, const FieldMappingBuilder* field_mapping_builder, const std::unordered_map& deletion_file_map, + const std::vector& row_ranges, const std::shared_ptr& data_file_path_factory) const; static bool NeedCompleteRowTrackingFields(bool row_tracking_enabled, diff --git a/src/paimon/core/operation/data_evolution_split_read.cpp b/src/paimon/core/operation/data_evolution_split_read.cpp index 4fe29a76..191fe17e 100644 --- a/src/paimon/core/operation/data_evolution_split_read.cpp +++ b/src/paimon/core/operation/data_evolution_split_read.cpp @@ -18,10 +18,14 @@ #include "paimon/common/data/blob_utils.h" #include "paimon/common/file_index/bitmap/apply_bitmap_index_batch_reader.h" +#include "paimon/common/global_index/complete_index_score_batch_reader.h" #include "paimon/common/reader/complete_row_kind_batch_reader.h" #include "paimon/common/reader/concat_batch_reader.h" +#include "paimon/common/table/special_fields.h" #include "paimon/common/utils/range_helper.h" #include "paimon/core/core_options.h" +#include "paimon/core/global_index/indexed_split_impl.h" + namespace paimon { Status DataEvolutionSplitRead::BlobBunch::Add(const std::shared_ptr& file) { if (!BlobUtils::IsBlobFile(file->file_name)) { @@ -102,9 +106,35 @@ DataEvolutionSplitRead::DataEvolutionSplitRead( context->GetCoreOptions().GetBranch()), memory_pool, executor) {} +bool DataEvolutionSplitRead::HasIndexScoreField(const std::shared_ptr& read_schema) { + return read_schema->GetFieldIndex(SpecialFields::IndexScore().Name()) != -1; +} + Result> DataEvolutionSplitRead::CreateReader( - const std::shared_ptr& split) { - auto split_impl = dynamic_cast(split.get()); + const std::shared_ptr& split) { + if (auto indexed_split = std::dynamic_pointer_cast(split)) { + PAIMON_RETURN_NOT_OK(indexed_split->Validate()); + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr 
batch_reader, + InnerCreateReader(indexed_split->GetDataSplit(), indexed_split->RowRanges())); + if (HasIndexScoreField(raw_read_schema_)) { + return std::make_unique(std::move(batch_reader), + indexed_split->Scores(), pool_); + } + return batch_reader; + } else if (auto data_split = std::dynamic_pointer_cast(split)) { + if (HasIndexScoreField(raw_read_schema_)) { + return Status::Invalid( + "Invalid read schema, read _INDEX_SCORE while split cannot cast to IndexedSplit"); + } + return InnerCreateReader(data_split, /*row_ranges=*/{}); + } + return Status::Invalid("Invalid Split, cannot cast to IndexedSplit or DataSplit"); +} + +Result> DataEvolutionSplitRead::InnerCreateReader( + const std::shared_ptr& data_split, const std::vector& row_ranges) const { + auto split_impl = dynamic_cast(data_split.get()); if (split_impl == nullptr) { return Status::Invalid("unexpected error, split cast to impl failed"); } @@ -124,13 +154,13 @@ Result> DataEvolutionSplitRead::CreateReader( std::vector> raw_file_readers, CreateRawFileReaders(split_impl->Partition(), need_merge_files, raw_read_schema_, /*predicate=*/nullptr, - /*deletion_file_map=*/{}, data_file_path_factory)); + /*deletion_file_map=*/{}, row_ranges, data_file_path_factory)); assert(raw_file_readers.size() == 1); sub_readers.push_back(std::move(raw_file_readers[0])); } else { PAIMON_ASSIGN_OR_RAISE(std::unique_ptr evolution_reader, CreateUnionReader(split_impl->Partition(), need_merge_files, - data_file_path_factory)); + row_ranges, data_file_path_factory)); sub_readers.push_back(std::move(evolution_reader)); } } @@ -140,12 +170,12 @@ Result> DataEvolutionSplitRead::CreateReader( ApplyPredicateFilterIfNeeded(std::move(concat_batch_reader), context_->GetPredicate())); return std::make_unique(std::move(batch_reader), pool_); } - Result> DataEvolutionSplitRead::ApplyIndexAndDvReaderIfNeeded( std::unique_ptr&& file_reader, const std::shared_ptr& file, const std::shared_ptr& data_schema, const std::shared_ptr& read_schema, 
const std::shared_ptr& predicate, const std::unordered_map& deletion_file_map, + const std::vector& row_ranges, const std::shared_ptr& data_file_path_factory) const { if (!deletion_file_map.empty()) { return Status::Invalid("DataEvolutionSplitRead do not support deletion vector"); @@ -156,7 +186,7 @@ Result> DataEvolutionSplitRead::ApplyIndexAndDvRead return Status::Invalid("DataEvolutionSplitRead do not support predicate"); } PAIMON_ASSIGN_OR_RAISE(std::optional selection_row_ids, - file->ToFileSelection(context_->GetRowRanges())); + file->ToFileSelection(row_ranges)); ::ArrowSchema c_read_schema; PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*read_schema, &c_read_schema)); PAIMON_RETURN_NOT_OK( @@ -211,6 +241,7 @@ DataEvolutionSplitRead::SplitFieldBunches( Result> DataEvolutionSplitRead::CreateUnionReader( const BinaryRow& partition, const std::vector>& need_merge_files, + const std::vector& row_ranges, const std::shared_ptr& data_file_path_factory) const { auto blob_field_to_field_id = [&](const std::shared_ptr& file_meta) -> Result { @@ -232,12 +263,12 @@ Result> DataEvolutionSplitRead::CreateU PAIMON_ASSIGN_OR_RAISE( std::vector> fields_files, SplitFieldBunches(need_merge_files, blob_field_to_field_id, - /*has_row_ranges_selection=*/!context_->GetRowRanges().empty())); + /*has_row_ranges_selection=*/!row_ranges.empty())); assert(!fields_files.empty()); int64_t row_count = fields_files[0]->RowCount(); PAIMON_ASSIGN_OR_RAISE(int64_t first_row_id, fields_files[0]->Files()[0]->NonNullFirstRowId()); - if (context_->GetRowRanges().empty()) { + if (row_ranges.empty()) { for (const auto& bunch : fields_files) { if (bunch->RowCount() != row_count) { return Status::Invalid( @@ -302,7 +333,7 @@ Result> DataEvolutionSplitRead::CreateU PAIMON_ASSIGN_OR_RAISE( std::vector> file_readers, CreateRawFileReaders(partition, bunch->Files(), file_read_schema, - /*predicate=*/nullptr, /*deletion_file_map=*/{}, + /*predicate=*/nullptr, /*deletion_file_map=*/{}, row_ranges, 
data_file_path_factory)); if (file_readers.size() == 1) { file_batch_readers[file_idx] = std::move(file_readers[0]); @@ -319,30 +350,9 @@ Result> DataEvolutionSplitRead::CreateU field_offsets, pool_); } -Result DataEvolutionSplitRead::Match(const std::shared_ptr& data_split, +Result DataEvolutionSplitRead::Match(const std::shared_ptr& split, bool force_keep_delete) const { - auto split_impl = dynamic_cast(data_split.get()); - if (split_impl == nullptr) { - return Status::Invalid("Unexpected error, split cast to impl failed"); - } - const auto& files = split_impl->DataFiles(); - if (files.size() < 2) { - return false; - } - std::set first_row_ids; - for (const auto& file : files) { - if (BlobUtils::IsBlobFile(file->file_name)) { - return true; - } - std::optional first_row_id = file->first_row_id; - if (first_row_id == std::nullopt || file->file_source == std::nullopt || - file->file_source.value() != FileSource::Append()) { - return false; - } - first_row_ids.insert(first_row_id.value()); - } - // If all files have a distinct first row id, we don't need to merge fields - return first_row_ids.size() != files.size(); + return true; } Result>>> diff --git a/src/paimon/core/operation/data_evolution_split_read.h b/src/paimon/core/operation/data_evolution_split_read.h index a1421cc0..34ed10b3 100644 --- a/src/paimon/core/operation/data_evolution_split_read.h +++ b/src/paimon/core/operation/data_evolution_split_read.h @@ -51,7 +51,7 @@ struct DeletionFile; /// otherwise, it must be present in the read path. 
/// /// Readers Overview: (ConcatBatchReader across -/// splits)->CompleteRowKindBatchReader->(PredicateBatchReader) +/// splits)->(CompleteIndexScoreBatchReader)->CompleteRowKindBatchReader->(PredicateBatchReader) /// ->ConcatBatchReader across files->DataEvolutionFileReader->(ConcatBatchReader across blob files) /// ->FieldMappingReader->(CompleteRowTrackingFieldsBatchReader) /// ->(DelegatingPrefetchReader)->(PrefetchFileBatchReader)->FormatReader @@ -67,11 +67,9 @@ class DataEvolutionSplitRead : public AbstractSplitRead { const std::shared_ptr& memory_pool, const std::shared_ptr& executor); - Result> CreateReader( - const std::shared_ptr& split) override; + Result> CreateReader(const std::shared_ptr& split) override; - Result Match(const std::shared_ptr& data_split, - bool force_keep_delete) const override; + Result Match(const std::shared_ptr& split, bool force_keep_delete) const override; Result> ApplyIndexAndDvReaderIfNeeded( std::unique_ptr&& file_reader, const std::shared_ptr& file, @@ -79,6 +77,7 @@ class DataEvolutionSplitRead : public AbstractSplitRead { const std::shared_ptr& read_schema, const std::shared_ptr& predicate, const std::unordered_map& deletion_file_map, + const std::vector& row_ranges, const std::shared_ptr& data_file_path_factory) const override; private: @@ -129,6 +128,9 @@ class DataEvolutionSplitRead : public AbstractSplitRead { }; private: + Result> InnerCreateReader( + const std::shared_ptr& data_split, const std::vector& row_ranges) const; + static Result>> SplitFieldBunches(const std::vector>& need_merge_files, const std::function(const std::shared_ptr&)>& @@ -137,10 +139,13 @@ class DataEvolutionSplitRead : public AbstractSplitRead { static Result>>> MergeRangesAndSort( std::vector>&& files); + static bool HasIndexScoreField(const std::shared_ptr& read_schema); + private: Result> CreateUnionReader( const BinaryRow& partition, const std::vector>& need_merge_files, + const std::vector& row_ranges, const std::shared_ptr& 
data_file_path_factory) const; }; diff --git a/src/paimon/core/operation/data_evolution_split_read_test.cpp b/src/paimon/core/operation/data_evolution_split_read_test.cpp index c6745bf6..5f3426ba 100644 --- a/src/paimon/core/operation/data_evolution_split_read_test.cpp +++ b/src/paimon/core/operation/data_evolution_split_read_test.cpp @@ -99,106 +99,6 @@ class DataEvolutionSplitReadTest : public ::testing::Test { std::shared_ptr pool_ = GetDefaultPool(); }; -TEST_F(DataEvolutionSplitReadTest, TestMatch) { - std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; - ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f0", "_ROW_ID", "_SEQUENCE_NUMBER"}); - ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); - SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); - ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); - auto new_options = table_schema->Options(); - new_options[Options::ROW_TRACKING_ENABLED] = "true"; - new_options[Options::DATA_EVOLUTION_ENABLED] = "true"; - ASSERT_OK_AND_ASSIGN( - std::shared_ptr internal_context, - InternalReadContext::Create(std::move(read_context), table_schema, new_options)); - auto split_read = std::make_unique( - /*path_factory=*/nullptr, internal_context, pool_, - CreateDefaultExecutor(/*thread_count=*/2)); - - auto create_data_split = - [](std::vector>&& metas) -> std::shared_ptr { - DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), - /*bucket=*/0, /*bucket_path=*/"", std::move(metas)); - return builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build().value(); - }; - { - // test empty split - auto data_split = create_data_split({}); - ASSERT_OK_AND_ASSIGN(bool match_result, - split_read->Match(data_split, /*force_keep_delete=*/false)); - ASSERT_FALSE(match_result); - } - { - // test only one file - auto meta = CreateDataFileMeta(/*file_name=*/"file1.orc", /*first_row_id=*/std::nullopt, - 
FileSource::Append()); - auto data_split = create_data_split({meta}); - ASSERT_OK_AND_ASSIGN(bool match_result, - split_read->Match(data_split, /*force_keep_delete=*/false)); - ASSERT_FALSE(match_result); - } - { - // test with null first row id - auto meta1 = - CreateDataFileMeta(/*file_name=*/"file1.orc", /*first_row_id=*/1, FileSource::Append()); - auto meta2 = CreateDataFileMeta(/*file_name=*/"file2.orc", /*first_row_id=*/std::nullopt, - FileSource::Append()); - auto data_split = create_data_split({meta1, meta2}); - ASSERT_OK_AND_ASSIGN(bool match_result, - split_read->Match(data_split, /*force_keep_delete=*/false)); - ASSERT_FALSE(match_result); - } - { - // test with different first row id - auto meta1 = - CreateDataFileMeta(/*file_name=*/"file1.orc", /*first_row_id=*/1, FileSource::Append()); - auto meta2 = - CreateDataFileMeta(/*file_name=*/"file2.orc", /*first_row_id=*/2, FileSource::Append()); - auto data_split = create_data_split({meta1, meta2}); - ASSERT_OK_AND_ASSIGN(bool match_result, - split_read->Match(data_split, /*force_keep_delete=*/false)); - ASSERT_FALSE(match_result); - } - { - // test with compact file source - auto meta1 = - CreateDataFileMeta(/*file_name=*/"file1.orc", /*first_row_id=*/1, FileSource::Append()); - auto meta2 = CreateDataFileMeta(/*file_name=*/"file2.orc", /*first_row_id=*/1, - FileSource::Compact()); - auto data_split = create_data_split({meta1, meta2}); - ASSERT_OK_AND_ASSIGN(bool match_result, - split_read->Match(data_split, /*force_keep_delete=*/false)); - ASSERT_FALSE(match_result); - } - { - // test with null file source - auto meta1 = - CreateDataFileMeta(/*file_name=*/"file1.orc", /*first_row_id=*/1, FileSource::Append()); - auto meta2 = CreateDataFileMeta(/*file_name=*/"file2.orc", /*first_row_id=*/1, - /*file_source=*/std::nullopt); - auto data_split = create_data_split({meta1, meta2}); - ASSERT_OK_AND_ASSIGN(bool match_result, - split_read->Match(data_split, /*force_keep_delete=*/false)); - 
ASSERT_FALSE(match_result); - } - { - // test match simple - auto meta1 = CreateDataFileMeta(/*file_name=*/"file1.orc", /*first_row_id=*/100, - FileSource::Append()); - auto meta2 = CreateDataFileMeta(/*file_name=*/"file2.orc", /*first_row_id=*/100, - FileSource::Append()); - auto data_split = create_data_split({meta1, meta2}); - ASSERT_OK_AND_ASSIGN(bool match_result, - split_read->Match(data_split, /*force_keep_delete=*/false)); - ASSERT_TRUE(match_result); - // The force_keep_delete parameter is not used in match, so test both values - ASSERT_OK_AND_ASSIGN(bool match_result2, - split_read->Match(data_split, /*force_keep_delete=*/true)); - ASSERT_TRUE(match_result2); - } -} - TEST_F(DataEvolutionSplitReadTest, TestAddSingleBlobEntry) { auto blob_entry = CreateBlobFile("blob1", /*first_row_id=*/0, /*row_count=*/100, diff --git a/src/paimon/core/operation/internal_read_context.cpp b/src/paimon/core/operation/internal_read_context.cpp index 1590f70c..f985562d 100644 --- a/src/paimon/core/operation/internal_read_context.cpp +++ b/src/paimon/core/operation/internal_read_context.cpp @@ -39,7 +39,7 @@ Result> InternalReadContext::Create( std::vector read_data_fields; read_data_fields.reserve(context->GetReadSchema().size()); for (const auto& name : context->GetReadSchema()) { - // if enable row tracking, check special fields + // if enable row tracking or data evolution, check special fields if (core_options.RowTrackingEnabled() && name == SpecialFields::RowId().Name()) { read_data_fields.push_back(SpecialFields::RowId()); continue; @@ -48,6 +48,10 @@ Result> InternalReadContext::Create( read_data_fields.push_back(SpecialFields::SequenceNumber()); continue; } + if (core_options.DataEvolutionEnabled() && name == SpecialFields::IndexScore().Name()) { + read_data_fields.push_back(SpecialFields::IndexScore()); + continue; + } PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema->GetField(name)); read_data_fields.push_back(field); } diff --git 
a/src/paimon/core/operation/internal_read_context.h b/src/paimon/core/operation/internal_read_context.h index df80b0de..1d063f57 100644 --- a/src/paimon/core/operation/internal_read_context.h +++ b/src/paimon/core/operation/internal_read_context.h @@ -65,9 +65,6 @@ class InternalReadContext { const std::shared_ptr& GetPredicate() const { return read_context_->GetPredicate(); } - const std::vector& GetRowRanges() const { - return read_context_->GetRowRanges(); - } bool EnablePredicateFilter() const { return read_context_->EnablePredicateFilter(); } diff --git a/src/paimon/core/operation/internal_read_context_test.cpp b/src/paimon/core/operation/internal_read_context_test.cpp index 341bfe14..e9875662 100644 --- a/src/paimon/core/operation/internal_read_context_test.cpp +++ b/src/paimon/core/operation/internal_read_context_test.cpp @@ -63,24 +63,25 @@ TEST(InternalReadContext, TestReadWithSpecifiedSchema) { ASSERT_TRUE(internal_context->GetReadSchema()->Equals(expected_schema)); } -TEST(InternalReadContext, TestReadWithRowTrackingFields) { +TEST(InternalReadContext, TestReadWithRowTrackingAndScoreFields) { { // test simple std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; ReadContextBuilder context_builder(path); - context_builder.SetReadSchema({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER"}); + context_builder.SetReadSchema({"f3", "f0", "_ROW_ID", "_SEQUENCE_NUMBER", "_INDEX_SCORE"}); ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); auto new_options = table_schema->Options(); new_options[Options::ROW_TRACKING_ENABLED] = "true"; + new_options[Options::DATA_EVOLUTION_ENABLED] = "true"; ASSERT_OK_AND_ASSIGN( auto internal_context, InternalReadContext::Create(std::move(read_context), table_schema, new_options)); - std::vector read_fields = {DataField(3, arrow::field("f3", 
arrow::float64())), - DataField(0, arrow::field("f0", arrow::utf8())), - SpecialFields::RowId(), - SpecialFields::SequenceNumber()}; + std::vector read_fields = { + DataField(3, arrow::field("f3", arrow::float64())), + DataField(0, arrow::field("f0", arrow::utf8())), SpecialFields::RowId(), + SpecialFields::SequenceNumber(), SpecialFields::IndexScore()}; auto expected_schema = DataField::ConvertDataFieldsToArrowSchema(read_fields); ASSERT_TRUE(internal_context->GetReadSchema()->Equals(expected_schema)); } @@ -96,6 +97,18 @@ TEST(InternalReadContext, TestReadWithRowTrackingFields) { table_schema->Options()), "Get field _ROW_ID failed: not exist in table schema"); } + { + // test invalid case: disable data evolution while read score fields + std::string path = paimon::test::GetDataDir() + "/orc/append_09.db/append_09"; + ReadContextBuilder context_builder(path); + context_builder.SetReadSchema({"f3", "f0", "_INDEX_SCORE"}); + ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); + SchemaManager schema_manager(std::make_shared(), read_context->GetPath()); + ASSERT_OK_AND_ASSIGN(auto table_schema, schema_manager.ReadSchema(0)); + ASSERT_NOK_WITH_MSG(InternalReadContext::Create(std::move(read_context), table_schema, + table_schema->Options()), + "Get field _INDEX_SCORE failed: not exist in table schema"); + } } } // namespace paimon::test diff --git a/src/paimon/core/operation/merge_file_split_read.cpp b/src/paimon/core/operation/merge_file_split_read.cpp index b653e501..65057476 100644 --- a/src/paimon/core/operation/merge_file_split_read.cpp +++ b/src/paimon/core/operation/merge_file_split_read.cpp @@ -126,7 +126,7 @@ Result> MergeFileSplitRead::Create( } Result> MergeFileSplitRead::CreateReader( - const std::shared_ptr& split) { + const std::shared_ptr& split) { auto data_split = std::dynamic_pointer_cast(split); if (!data_split) { return Status::Invalid("cannot cast split to data_split in MergeFileSplitRead"); @@ -154,6 +154,7 @@ Result> 
MergeFileSplitRead::ApplyIndexAndDvReaderIf const std::shared_ptr& data_schema, const std::shared_ptr& read_schema, const std::shared_ptr& predicate, const std::unordered_map& deletion_file_map, + const std::vector& ranges, const std::shared_ptr& data_file_path_factory) const { // merge read does not use index PAIMON_UNIQUE_PTR deletion_vector; @@ -208,7 +209,7 @@ Result> MergeFileSplitRead::CreateNoMergeReader( std::vector> raw_file_readers, CreateRawFileReaders(data_split->Partition(), data_split->DataFiles(), read_schema, only_filter_key ? predicate_for_keys_ : context_->GetPredicate(), - deletion_file_map, data_file_path_factory)); + deletion_file_map, /*row_ranges=*/{}, data_file_path_factory)); auto concat_batch_reader = std::make_unique(std::move(raw_file_readers), pool_); @@ -414,9 +415,10 @@ Result> MergeFileSplitRead::CreateReaderFo const std::shared_ptr& data_file_path_factory) const { // no overlap in a run const auto& data_files = sorted_run.Files(); - PAIMON_ASSIGN_OR_RAISE(std::vector> raw_file_readers, - CreateRawFileReaders(partition, data_files, read_schema_, predicate, - deletion_file_map, data_file_path_factory)); + PAIMON_ASSIGN_OR_RAISE( + std::vector> raw_file_readers, + CreateRawFileReaders(partition, data_files, read_schema_, predicate, deletion_file_map, + /*row_ranges=*/{}, data_file_path_factory)); assert(data_files.size() == raw_file_readers.size()); // KeyValueDataFileRecordReader converts arrow array from format reader to KeyValue objects @@ -445,10 +447,10 @@ Result> MergeFileSplitRead::CreateSortMergeRead return Status::Invalid("only support loser-tree or min-heap sort engine"); } -Result MergeFileSplitRead::Match(const std::shared_ptr& data_split, +Result MergeFileSplitRead::Match(const std::shared_ptr& split, bool force_keep_delete) const { // TODO(yonghao.fyh): just pass split impl - auto split_impl = dynamic_cast(data_split.get()); + auto split_impl = dynamic_cast(split.get()); if (split_impl == nullptr) { return 
Status::Invalid("unexpected error, split cast to impl failed"); } diff --git a/src/paimon/core/operation/merge_file_split_read.h b/src/paimon/core/operation/merge_file_split_read.h index f34b7c69..ca01cc1a 100644 --- a/src/paimon/core/operation/merge_file_split_read.h +++ b/src/paimon/core/operation/merge_file_split_read.h @@ -80,11 +80,9 @@ class MergeFileSplitRead : public AbstractSplitRead { const std::shared_ptr& context, const std::shared_ptr& memory_pool, const std::shared_ptr& executor); - Result> CreateReader( - const std::shared_ptr& split) override; + Result> CreateReader(const std::shared_ptr& split) override; - Result Match(const std::shared_ptr& data_split, - bool force_keep_delete) const override; + Result Match(const std::shared_ptr& split, bool force_keep_delete) const override; Result> ApplyIndexAndDvReaderIfNeeded( std::unique_ptr&& file_reader, const std::shared_ptr& file, @@ -92,6 +90,7 @@ class MergeFileSplitRead : public AbstractSplitRead { const std::shared_ptr& read_schema, const std::shared_ptr& predicate, const std::unordered_map& deletion_file_map, + const std::vector& ranges, const std::shared_ptr& data_file_path_factory) const override; private: diff --git a/src/paimon/core/operation/raw_file_split_read.cpp b/src/paimon/core/operation/raw_file_split_read.cpp index 07a555e4..bf30dd6d 100644 --- a/src/paimon/core/operation/raw_file_split_read.cpp +++ b/src/paimon/core/operation/raw_file_split_read.cpp @@ -61,7 +61,7 @@ RawFileSplitRead::RawFileSplitRead(const std::shared_ptr& memory_pool, executor) {} Result> RawFileSplitRead::CreateReader( - const std::shared_ptr& split) { + const std::shared_ptr& split) { auto data_split = std::dynamic_pointer_cast(split); if (!data_split) { return Status::Invalid("cannot cast split to data_split in RawFileSplitRead"); @@ -71,10 +71,10 @@ Result> RawFileSplitRead::CreateReader( PAIMON_ASSIGN_OR_RAISE( std::shared_ptr data_file_path_factory, 
path_factory_->CreateDataFilePathFactory(data_split->Partition(), data_split->Bucket())); - PAIMON_ASSIGN_OR_RAISE( - std::vector> raw_file_readers, - CreateRawFileReaders(data_split->Partition(), data_split->DataFiles(), raw_read_schema_, - predicate, deletion_file_map, data_file_path_factory)); + PAIMON_ASSIGN_OR_RAISE(std::vector> raw_file_readers, + CreateRawFileReaders(data_split->Partition(), data_split->DataFiles(), + raw_read_schema_, predicate, deletion_file_map, + /*row_ranges=*/{}, data_file_path_factory)); auto concat_batch_reader = std::make_unique(std::move(raw_file_readers), pool_); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr batch_reader, @@ -82,9 +82,9 @@ Result> RawFileSplitRead::CreateReader( return std::make_unique(std::move(batch_reader), pool_); } -Result RawFileSplitRead::Match(const std::shared_ptr& data_split, +Result RawFileSplitRead::Match(const std::shared_ptr& split, bool force_keep_delete) const { - auto split_impl = dynamic_cast(data_split.get()); + auto split_impl = dynamic_cast(split.get()); if (split_impl == nullptr) { return Status::Invalid("unexpected error, split cast to impl failed"); } @@ -112,6 +112,7 @@ Result> RawFileSplitRead::ApplyIndexAndDvReaderIfNe const std::shared_ptr& data_schema, const std::shared_ptr& read_schema, const std::shared_ptr& predicate, const std::unordered_map& deletion_file_map, + const std::vector& ranges, const std::shared_ptr& data_file_path_factory) const { std::shared_ptr file_index_result; if (options_.FileIndexReadEnabled()) { @@ -153,16 +154,6 @@ Result> RawFileSplitRead::ApplyIndexAndDvReaderIfNe actual_selection.value().Flip(0, file_reader->GetNumberOfRows()); } - // merge row id selection - PAIMON_ASSIGN_OR_RAISE(std::optional selection_row_ids, - file->ToFileSelection(context_->GetRowRanges())); - if (selection_row_ids) { - if (actual_selection) { - actual_selection.value() &= selection_row_ids.value(); - } else { - actual_selection = std::move(selection_row_ids); - } - } if (actual_selection 
&& actual_selection.value().IsEmpty()) { return std::unique_ptr(); } diff --git a/src/paimon/core/operation/raw_file_split_read.h b/src/paimon/core/operation/raw_file_split_read.h index 7f722c24..a1978510 100644 --- a/src/paimon/core/operation/raw_file_split_read.h +++ b/src/paimon/core/operation/raw_file_split_read.h @@ -60,11 +60,9 @@ class RawFileSplitRead : public AbstractSplitRead { const std::shared_ptr& memory_pool, const std::shared_ptr& executor); - Result> CreateReader( - const std::shared_ptr& split) override; + Result> CreateReader(const std::shared_ptr& split) override; - Result Match(const std::shared_ptr& data_split, - bool force_keep_delete) const override; + Result Match(const std::shared_ptr& split, bool force_keep_delete) const override; Result> ApplyIndexAndDvReaderIfNeeded( std::unique_ptr&& file_reader, const std::shared_ptr& file, @@ -72,6 +70,7 @@ class RawFileSplitRead : public AbstractSplitRead { const std::shared_ptr& read_schema, const std::shared_ptr& predicate, const std::unordered_map& deletion_file_map, + const std::vector& ranges, const std::shared_ptr& data_file_path_factory) const override; }; diff --git a/src/paimon/core/operation/read_context.cpp b/src/paimon/core/operation/read_context.cpp index 00472852..1885100a 100644 --- a/src/paimon/core/operation/read_context.cpp +++ b/src/paimon/core/operation/read_context.cpp @@ -27,20 +27,21 @@ namespace paimon { class Predicate; -ReadContext::ReadContext( - const std::string& path, const std::string& branch, const std::vector& read_schema, - const std::shared_ptr& predicate, const std::vector& row_ranges, - bool enable_predicate_filter, bool enable_prefetch, uint32_t prefetch_batch_count, - uint32_t prefetch_max_parallel_num, bool enable_multi_thread_row_to_batch, - uint32_t row_to_batch_thread_number, const std::optional& table_schema, - const std::shared_ptr& memory_pool, const std::shared_ptr& executor, - const std::map& fs_scheme_to_identifier_map, - const std::map& options) 
+ReadContext::ReadContext(const std::string& path, const std::string& branch, + const std::vector& read_schema, + const std::shared_ptr& predicate, bool enable_predicate_filter, + bool enable_prefetch, uint32_t prefetch_batch_count, + uint32_t prefetch_max_parallel_num, bool enable_multi_thread_row_to_batch, + uint32_t row_to_batch_thread_number, + const std::optional& table_schema, + const std::shared_ptr& memory_pool, + const std::shared_ptr& executor, + const std::map& fs_scheme_to_identifier_map, + const std::map& options) : path_(path), branch_(branch), read_schema_(read_schema), predicate_(predicate), - row_ranges_(row_ranges), enable_predicate_filter_(enable_predicate_filter), enable_prefetch_(enable_prefetch), prefetch_batch_count_(prefetch_batch_count), @@ -64,7 +65,6 @@ class ReadContextBuilder::Impl { fs_scheme_to_identifier_map_.clear(); options_.clear(); predicate_.reset(); - row_ranges_.clear(); enable_predicate_filter_ = false; enable_prefetch_ = false; prefetch_batch_count_ = 600; @@ -83,7 +83,6 @@ class ReadContextBuilder::Impl { std::map fs_scheme_to_identifier_map_; std::map options_; std::shared_ptr predicate_; - std::vector row_ranges_; bool enable_predicate_filter_ = false; bool enable_prefetch_ = false; uint32_t prefetch_batch_count_ = 600; @@ -181,11 +180,6 @@ ReadContextBuilder& ReadContextBuilder::WithFileSystemSchemeToIdentifierMap( return *this; } -ReadContextBuilder& ReadContextBuilder::SetRowRanges(const std::vector& row_ranges) { - impl_->row_ranges_ = row_ranges; - return *this; -} - Result> ReadContextBuilder::Finish() { PAIMON_ASSIGN_OR_RAISE(impl_->path_, PathUtil::NormalizePath(impl_->path_)); if (impl_->path_.empty()) { @@ -210,11 +204,10 @@ Result> ReadContextBuilder::Finish() { } auto ctx = std::make_unique( impl_->path_, impl_->branch_, impl_->read_field_names_, impl_->predicate_, - impl_->row_ranges_, impl_->enable_predicate_filter_, impl_->enable_prefetch_, - impl_->prefetch_batch_count_, impl_->prefetch_max_parallel_num_, 
- impl_->enable_multi_thread_row_to_batch_, impl_->row_to_batch_thread_number_, - impl_->table_schema_, impl_->memory_pool_, impl_->executor_, - impl_->fs_scheme_to_identifier_map_, impl_->options_); + impl_->enable_predicate_filter_, impl_->enable_prefetch_, impl_->prefetch_batch_count_, + impl_->prefetch_max_parallel_num_, impl_->enable_multi_thread_row_to_batch_, + impl_->row_to_batch_thread_number_, impl_->table_schema_, impl_->memory_pool_, + impl_->executor_, impl_->fs_scheme_to_identifier_map_, impl_->options_); impl_->Reset(); return ctx; } diff --git a/src/paimon/core/operation/read_context_test.cpp b/src/paimon/core/operation/read_context_test.cpp index 45e39f8c..58d9e4fb 100644 --- a/src/paimon/core/operation/read_context_test.cpp +++ b/src/paimon/core/operation/read_context_test.cpp @@ -34,7 +34,6 @@ TEST(ReadContextTest, TestSimple) { ASSERT_TRUE(ctx->GetReadSchema().empty()); ASSERT_TRUE(ctx->GetOptions().empty()); ASSERT_FALSE(ctx->GetPredicate()); - ASSERT_TRUE(ctx->GetRowRanges().empty()); ASSERT_FALSE(ctx->EnablePredicateFilter()); ASSERT_FALSE(ctx->EnablePrefetch()); ASSERT_EQ(600, ctx->GetPrefetchBatchCount()); @@ -52,8 +51,6 @@ TEST(ReadContextTest, TestSetContent) { auto predicate = PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f1", FieldType::INT); builder.SetPredicate(predicate); - std::vector row_ranges = {Range(1, 2), Range(4, 5)}; - builder.SetRowRanges(row_ranges); builder.EnablePredicateFilter(true); builder.EnablePrefetch(true); builder.SetPrefetchBatchCount(1200); @@ -70,7 +67,6 @@ TEST(ReadContextTest, TestSetContent) { ASSERT_TRUE(ctx->GetExecutor()); ASSERT_EQ(ctx->GetReadSchema(), std::vector({"f1", "f2"})); ASSERT_EQ(*predicate, *(ctx->GetPredicate())); - ASSERT_EQ(ctx->GetRowRanges(), row_ranges); ASSERT_TRUE(ctx->EnablePredicateFilter()); ASSERT_TRUE(ctx->EnablePrefetch()); ASSERT_EQ(1200, ctx->GetPrefetchBatchCount()); diff --git a/src/paimon/core/operation/split_read.h b/src/paimon/core/operation/split_read.h 
index 754b7b37..bb45ee68 100644 --- a/src/paimon/core/operation/split_read.h +++ b/src/paimon/core/operation/split_read.h @@ -33,9 +33,9 @@ class SplitRead { virtual ~SplitRead() = default; virtual Result> CreateReader( - const std::shared_ptr& data_split) = 0; + const std::shared_ptr& split) = 0; - virtual Result Match(const std::shared_ptr& data_split, + virtual Result Match(const std::shared_ptr& split, bool force_keep_delete) const = 0; }; diff --git a/src/paimon/core/table/source/append_only_table_read.cpp b/src/paimon/core/table/source/append_only_table_read.cpp index 84dd314d..44626f11 100644 --- a/src/paimon/core/table/source/append_only_table_read.cpp +++ b/src/paimon/core/table/source/append_only_table_read.cpp @@ -38,17 +38,18 @@ AppendOnlyTableRead::AppendOnlyTableRead(const std::shared_ptr(path_factory, context, memory_pool, executor)); + } else { + split_reads_.push_back( + std::make_unique(path_factory, context, memory_pool, executor)); } - split_reads_.push_back( - std::make_unique(path_factory, context, memory_pool, executor)); } Result> AppendOnlyTableRead::CreateReader( - const std::shared_ptr& data_split) { + const std::shared_ptr& split) { for (const auto& read : split_reads_) { - PAIMON_ASSIGN_OR_RAISE(bool matched, read->Match(data_split, /*force_keep_delete=*/false)); + PAIMON_ASSIGN_OR_RAISE(bool matched, read->Match(split, /*force_keep_delete=*/false)); if (matched) { - return read->CreateReader(data_split); + return read->CreateReader(split); } } return Status::Invalid("create reader failed, not read match with data split."); diff --git a/src/paimon/core/table/source/append_only_table_read.h b/src/paimon/core/table/source/append_only_table_read.h index 7b9ed89c..fc1282ff 100644 --- a/src/paimon/core/table/source/append_only_table_read.h +++ b/src/paimon/core/table/source/append_only_table_read.h @@ -29,7 +29,6 @@ namespace paimon { class SplitRead; -class DataSplit; class Executor; class FileStorePathFactory; class InternalReadContext; @@ 
-43,7 +42,7 @@ class AppendOnlyTableRead : public TableRead { const std::shared_ptr& executor); Result> CreateReader( - const std::shared_ptr& data_split) override; + const std::shared_ptr& split) override; private: std::vector> split_reads_; diff --git a/src/paimon/core/table/source/data_split.cpp b/src/paimon/core/table/source/data_split.cpp deleted file mode 100644 index 043f2164..00000000 --- a/src/paimon/core/table/source/data_split.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "paimon/table/source/data_split.h" - -#include - -#include "fmt/format.h" -#include "paimon/common/data/binary_row.h" -#include "paimon/common/io/memory_segment_output_stream.h" -#include "paimon/common/memory/memory_segment_utils.h" -#include "paimon/common/utils/serialization_utils.h" -#include "paimon/core/io/data_file_meta_serializer.h" -#include "paimon/core/table/source/data_split_impl.h" -#include "paimon/core/table/source/deletion_file.h" -#include "paimon/core/table/source/fallback_data_split.h" -#include "paimon/core/utils/object_serializer.h" -#include "paimon/io/byte_array_input_stream.h" -#include "paimon/io/data_input_stream.h" -#include "paimon/memory/bytes.h" -#include "paimon/memory/memory_pool.h" -#include "paimon/status.h" - -namespace paimon { -struct DataFileMeta; - -Result DataSplit::Serialize(const std::shared_ptr& data_split, - const std::shared_ptr& pool) { - MemorySegmentOutputStream out(MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool); - auto data_split_impl = std::dynamic_pointer_cast(data_split); - if (!data_split_impl) { - return Status::Invalid("invalid data split"); - } - out.WriteValue(DataSplitImpl::MAGIC); - out.WriteValue(DataSplitImpl::VERSION); - out.WriteValue(data_split_impl->SnapshotId()); - - PAIMON_RETURN_NOT_OK( - SerializationUtils::SerializeBinaryRow(data_split_impl->Partition(), &out)); - out.WriteValue(data_split_impl->Bucket()); - out.WriteString(data_split_impl->BucketPath()); - - std::optional total_buckets = data_split_impl->TotalBuckets(); - if (total_buckets == std::nullopt) { - out.WriteValue(false); - } else { - out.WriteValue(true); - out.WriteValue(total_buckets.value()); - } - - DataFileMetaSerializer serializer(pool); - PAIMON_RETURN_NOT_OK(serializer.SerializeList(data_split_impl->BeforeFiles(), &out)); - - DeletionFile::SerializeList(data_split_impl->BeforeDeletionFiles(), &out); - PAIMON_RETURN_NOT_OK(serializer.SerializeList(data_split_impl->DataFiles(), &out)); - 
DeletionFile::SerializeList(data_split_impl->DeletionFiles(), &out); - out.WriteValue(data_split_impl->IsStreaming()); - out.WriteValue(data_split_impl->RawConvertible()); - - PAIMON_UNIQUE_PTR bytes = - MemorySegmentUtils::CopyToBytes(out.Segments(), 0, out.CurrentSize(), pool.get()); - return std::string(bytes->data(), bytes->size()); -} - -Result> DataSplit::Deserialize(const char* buffer, size_t length, - const std::shared_ptr& pool) { - auto input_stream = std::make_shared(buffer, length); - DataInputStream in(input_stream); - int64_t magic = -1; - PAIMON_ASSIGN_OR_RAISE(magic, in.ReadValue()); - int32_t version = 1; - if (magic == DataSplitImpl::MAGIC) { - PAIMON_ASSIGN_OR_RAISE(version, in.ReadValue()); - } - - // version 1 does not write magic number in, so the first long is snapshot id. - int64_t snapshot_id = magic; - if (version != 1) { - PAIMON_ASSIGN_OR_RAISE(snapshot_id, in.ReadValue()); - } - - PAIMON_ASSIGN_OR_RAISE(BinaryRow partition, - SerializationUtils::DeserializeBinaryRow(&in, pool.get())); - int32_t bucket = -1; - PAIMON_ASSIGN_OR_RAISE(bucket, in.ReadValue()); - std::string bucket_path; - PAIMON_ASSIGN_OR_RAISE(bucket_path, in.ReadString()); - - std::optional total_buckets; - if (version >= 6) { - PAIMON_ASSIGN_OR_RAISE(bool total_buckets_exist, in.ReadValue()); - if (total_buckets_exist) { - PAIMON_ASSIGN_OR_RAISE(total_buckets, in.ReadValue()); - } - } - - PAIMON_ASSIGN_OR_RAISE( - std::unique_ptr>> data_file_serializer, - DataSplitImpl::GetFileMetaSerializer(version, pool)); - std::vector> before_files; - PAIMON_ASSIGN_OR_RAISE(before_files, data_file_serializer->DeserializeList(&in)); - // compatible for deletion file - std::vector> before_deletion_files; - PAIMON_ASSIGN_OR_RAISE(before_deletion_files, DeletionFile::DeserializeList(&in, version)); - - std::vector> data_files; - PAIMON_ASSIGN_OR_RAISE(data_files, data_file_serializer->DeserializeList(&in)); - // compatible for deletion file - std::vector> data_deletion_files; - 
PAIMON_ASSIGN_OR_RAISE(data_deletion_files, DeletionFile::DeserializeList(&in, version)); - - bool is_streaming = false; - PAIMON_ASSIGN_OR_RAISE(is_streaming, in.ReadValue()); - bool raw_convertible = false; - PAIMON_ASSIGN_OR_RAISE(raw_convertible, in.ReadValue()); - - DataSplitImpl::Builder builder(partition, bucket, bucket_path, std::move(data_files)); - builder.WithTotalBuckets(total_buckets) - .WithSnapshot(snapshot_id) - .WithBeforeFiles(std::move(before_files)) - .IsStreaming(is_streaming) - .RawConvertible(raw_convertible); - if (!before_deletion_files.empty()) { - builder.WithBeforeDeletionFiles(before_deletion_files); - } - if (!data_deletion_files.empty()) { - builder.WithDataDeletionFiles(data_deletion_files); - } - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_split, builder.Build()); - - PAIMON_ASSIGN_OR_RAISE(int64_t pos, in.GetPos()); - PAIMON_ASSIGN_OR_RAISE(int64_t stream_length, in.Length()); - if (pos == stream_length) { - return data_split; - } else if (pos == stream_length - 1) { - PAIMON_ASSIGN_OR_RAISE(bool is_fallback, in.ReadValue()); - return std::make_shared(data_split, is_fallback); - } else { - return Status::Invalid( - fmt::format("invalid data split byte stream, remaining {} bytes after deserializing", - stream_length - pos)); - } -} - -bool DataSplit::SimpleDataFileMeta::operator==(const DataSplit::SimpleDataFileMeta& other) const { - if (this == &other) { - return true; - } - return file_path == other.file_path && file_size == other.file_size && - row_count == other.row_count && min_sequence_number == other.min_sequence_number && - max_sequence_number == other.max_sequence_number && schema_id == other.schema_id && - level == other.level && creation_time == other.creation_time && - delete_row_count == other.delete_row_count; -} - -std::string DataSplit::SimpleDataFileMeta::ToString() const { - return fmt::format( - "{{filePath: {}, fileSize: {}, rowCount: {}, minSequenceNumber: {}, maxSequenceNumber: {}, " - "schemaId: {}, level: {}, 
creationTime: {}, deleteRowCount: {}}}", - file_path, file_size, row_count, min_sequence_number, max_sequence_number, schema_id, level, - creation_time.ToString(), - delete_row_count == std::nullopt ? "null" : std::to_string(delete_row_count.value())); -} - -} // namespace paimon diff --git a/src/paimon/core/table/source/data_split_impl.cpp b/src/paimon/core/table/source/data_split_impl.cpp new file mode 100644 index 00000000..2eace106 --- /dev/null +++ b/src/paimon/core/table/source/data_split_impl.cpp @@ -0,0 +1,159 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/core/table/source/data_split_impl.h" + +namespace paimon { + +bool DataSplit::SimpleDataFileMeta::operator==(const SimpleDataFileMeta& other) const { + if (this == &other) { + return true; + } + return file_path == other.file_path && file_size == other.file_size && + row_count == other.row_count && min_sequence_number == other.min_sequence_number && + max_sequence_number == other.max_sequence_number && schema_id == other.schema_id && + level == other.level && creation_time == other.creation_time && + delete_row_count == other.delete_row_count; +} +std::string DataSplit::SimpleDataFileMeta::ToString() const { + return fmt::format( + "{{filePath: {}, fileSize: {}, rowCount: {}, minSequenceNumber: {}, " + "maxSequenceNumber: {}, schemaId: {}, level: {}, creationTime: {}, deleteRowCount: " + "{}}}", + file_path, file_size, row_count, min_sequence_number, max_sequence_number, schema_id, level, + creation_time.ToString(), + delete_row_count == std::nullopt ? "null" : std::to_string(delete_row_count.value())); +} + +Result> DataSplitImpl::LatestFileCreationEpochMillis() const { + if (data_files_.empty()) { + return std::optional(); + } + int64_t epoch = INT64_MIN; + for (const auto& file : data_files_) { + PAIMON_ASSIGN_OR_RAISE(int64_t epoch_milli, file->CreationTimeEpochMillis()); + epoch = std::max(epoch, epoch_milli); + } + return std::optional(epoch); +} + +int64_t DataSplitImpl::RowCount() const { + int64_t row_count = 0; + for (const auto& file : data_files_) { + row_count += file->row_count; + } + return row_count; +} + +std::vector DataSplitImpl::GetFileList() const { + std::vector result_files; + result_files.reserve(data_files_.size()); + for (const auto& file : data_files_) { + std::string result_file_path; + if (!file->external_path) { + result_file_path = PathUtil::JoinPath(bucket_path_, file->file_name); + } else { + result_file_path = file->external_path.value(); + } + result_files.emplace_back(result_file_path, file->file_size, 
file->row_count, + file->min_sequence_number, file->max_sequence_number, + file->schema_id, file->level, file->creation_time, + file->delete_row_count); + } + return result_files; +} + +bool DataSplitImpl::operator==(const DataSplitImpl& other) const { + if (this == &other) { + return true; + } + return snapshot_id_ == other.snapshot_id_ && partition_ == other.partition_ && + bucket_ == other.bucket_ && bucket_path_ == other.bucket_path_ && + total_buckets_ == other.total_buckets_ && + ObjectUtils::Equal(before_files_, other.before_files_) && + before_deletion_files_ == other.before_deletion_files_ && + ObjectUtils::Equal(data_files_, other.data_files_) && + data_deletion_files_ == other.data_deletion_files_ && + is_streaming_ == other.is_streaming_ && raw_convertible_ == other.raw_convertible_; +} + +bool DataSplitImpl::TEST_Equal(const DataSplitImpl& other) const { + if (this == &other) { + return true; + } + return snapshot_id_ == other.snapshot_id_ && partition_ == other.partition_ && + bucket_ == other.bucket_ && bucket_path_ == other.bucket_path_ && + total_buckets_ == other.total_buckets_ && + ObjectUtils::TEST_Equal(before_files_, other.before_files_) && + before_deletion_files_ == other.before_deletion_files_ && + ObjectUtils::TEST_Equal(data_files_, other.data_files_) && + data_deletion_files_ == other.data_deletion_files_ && + is_streaming_ == other.is_streaming_ && raw_convertible_ == other.raw_convertible_; +} + +int64_t DataSplitImpl::PartialMergedRowCount() const { + if (!raw_convertible_) { + return 0l; + } + int64_t sum = 0; + for (size_t i = 0; i < data_files_.size(); i++) { + const auto& data_file = data_files_[i]; + if (data_deletion_files_.empty() || data_deletion_files_[i] == std::nullopt) { + sum += data_file->row_count; + } else if (data_deletion_files_[i].value().cardinality != std::nullopt) { + sum += data_file->row_count - data_deletion_files_[i].value().cardinality.value(); + } + } + return sum; +} + +Result>>> 
+DataSplitImpl::GetFileMetaSerializer(int32_t version, const std::shared_ptr& pool) { + if (version == 1) { + // TODO(xinyu.lxy): C++ paimon do not support data file meta 08 + return Status::NotImplemented("Do not support data file meta 08."); + } else if (version == 2) { + return std::make_unique(pool); + } else if (version == 3 || version == 4) { + return std::make_unique(pool); + } else if (version == 5 || version == 6) { + return std::make_unique(pool); + } else if (version == 7) { + return std::make_unique(pool); + } else if (version == VERSION) { + return std::make_unique(pool); + } else { + return Status::Invalid( + fmt::format("Expecting DataSplit version to be smaller or equal than {}, but found {}.", + VERSION, version)); + } +} + +std::string DataSplitImpl::ToString() const { + return fmt::format( + "snapshotId={}, partition={}, bucket={}, bucketPath={}, totalBuckets={}, " + "beforeFiles={}, " + "beforeDeletionFiles={}, dataFiles={}, dataDeletionFiles={}, isStreaming={}, " + "rawConvertible={}", + snapshot_id_, partition_.ToString(), bucket_, bucket_path_, + total_buckets_ == std::nullopt ? 
"null" : std::to_string(total_buckets_.value()), + StringUtils::VectorToString(before_files_), + StringUtils::VectorToString(before_deletion_files_), + StringUtils::VectorToString(data_files_), StringUtils::VectorToString(data_deletion_files_), + is_streaming_, raw_convertible_); +} + +} // namespace paimon diff --git a/src/paimon/core/table/source/data_split_impl.h b/src/paimon/core/table/source/data_split_impl.h index 8283899b..ca439d4b 100644 --- a/src/paimon/core/table/source/data_split_impl.h +++ b/src/paimon/core/table/source/data_split_impl.h @@ -79,71 +79,14 @@ class DataSplitImpl : public DataSplit { return raw_convertible_; } - Result> LatestFileCreationEpochMillis() const { - if (data_files_.empty()) { - return std::optional(); - } - int64_t epoch = INT64_MIN; - for (const auto& file : data_files_) { - PAIMON_ASSIGN_OR_RAISE(int64_t epoch_milli, file->CreationTimeEpochMillis()); - epoch = std::max(epoch, epoch_milli); - } - return std::optional(epoch); - } + Result> LatestFileCreationEpochMillis() const; - int64_t RowCount() const { - int64_t row_count = 0; - for (const auto& file : data_files_) { - row_count += file->row_count; - } - return row_count; - } + int64_t RowCount() const; - std::vector GetFileList() const override { - std::vector result_files; - result_files.reserve(data_files_.size()); - for (const auto& file : data_files_) { - std::string result_file_path; - if (!file->external_path) { - result_file_path = PathUtil::JoinPath(bucket_path_, file->file_name); - } else { - result_file_path = file->external_path.value(); - } - result_files.push_back(SimpleDataFileMeta( - {result_file_path, file->file_size, file->row_count, file->min_sequence_number, - file->max_sequence_number, file->schema_id, file->level, file->creation_time, - file->delete_row_count})); - } - return result_files; - } - - bool operator==(const DataSplitImpl& other) const { - if (this == &other) { - return true; - } - return snapshot_id_ == other.snapshot_id_ && partition_ == 
other.partition_ && - bucket_ == other.bucket_ && bucket_path_ == other.bucket_path_ && - total_buckets_ == other.total_buckets_ && - ObjectUtils::Equal(before_files_, other.before_files_) && - before_deletion_files_ == other.before_deletion_files_ && - ObjectUtils::Equal(data_files_, other.data_files_) && - data_deletion_files_ == other.data_deletion_files_ && - is_streaming_ == other.is_streaming_ && raw_convertible_ == other.raw_convertible_; - } + std::vector GetFileList() const override; - bool TEST_Equal(const DataSplitImpl& other) const { - if (this == &other) { - return true; - } - return snapshot_id_ == other.snapshot_id_ && partition_ == other.partition_ && - bucket_ == other.bucket_ && bucket_path_ == other.bucket_path_ && - total_buckets_ == other.total_buckets_ && - ObjectUtils::TEST_Equal(before_files_, other.before_files_) && - before_deletion_files_ == other.before_deletion_files_ && - ObjectUtils::TEST_Equal(data_files_, other.data_files_) && - data_deletion_files_ == other.data_deletion_files_ && - is_streaming_ == other.is_streaming_ && raw_convertible_ == other.raw_convertible_; - } + bool operator==(const DataSplitImpl& other) const; + bool TEST_Equal(const DataSplitImpl& other) const; /// Obtain merged row count as much as possible. There are two scenarios where accurate row /// count @@ -152,21 +95,7 @@ class DataSplitImpl : public DataSplit { /// 1. raw file and no deletion file. /// /// 2. raw file + deletion file with cardinality. 
- int64_t PartialMergedRowCount() const { - if (!raw_convertible_) { - return 0l; - } - int64_t sum = 0; - for (size_t i = 0; i < data_files_.size(); i++) { - const auto& data_file = data_files_[i]; - if (data_deletion_files_.empty() || data_deletion_files_[i] == std::nullopt) { - sum += data_file->row_count; - } else if (data_deletion_files_[i].value().cardinality != std::nullopt) { - sum += data_file->row_count - data_deletion_files_[i].value().cardinality.value(); - } - } - return sum; - } + int64_t PartialMergedRowCount() const; // Builder /// Builder for `DataSplitImpl`. @@ -228,40 +157,9 @@ class DataSplitImpl : public DataSplit { }; static Result>>> - GetFileMetaSerializer(int32_t version, const std::shared_ptr& pool) { - if (version == 1) { - // TODO(xinyu.lxy): C++ paimon do not support data file meta 08 - return Status::NotImplemented("Do not support data file meta 08."); - } else if (version == 2) { - return std::make_unique(pool); - } else if (version == 3 || version == 4) { - return std::make_unique(pool); - } else if (version == 5 || version == 6) { - return std::make_unique(pool); - } else if (version == 7) { - return std::make_unique(pool); - } else if (version == VERSION) { - return std::make_unique(pool); - } else { - return Status::Invalid(fmt::format( - "Expecting DataSplit version to be smaller or equal than {}, but found {}.", - VERSION, version)); - } - } + GetFileMetaSerializer(int32_t version, const std::shared_ptr& pool); - std::string ToString() const { - return fmt::format( - "snapshotId={}, partition={}, bucket={}, bucketPath={}, totalBuckets={}, " - "beforeFiles={}, " - "beforeDeletionFiles={}, dataFiles={}, dataDeletionFiles={}, isStreaming={}, " - "rawConvertible={}", - snapshot_id_, partition_.ToString(), bucket_, bucket_path_, - total_buckets_ == std::nullopt ? 
"null" : std::to_string(total_buckets_.value()), - StringUtils::VectorToString(before_files_), - StringUtils::VectorToString(before_deletion_files_), - StringUtils::VectorToString(data_files_), - StringUtils::VectorToString(data_deletion_files_), is_streaming_, raw_convertible_); - } + std::string ToString() const; private: DataSplitImpl(const BinaryRow& partition, int32_t bucket, const std::string& bucket_path, diff --git a/src/paimon/core/table/source/data_split_test.cpp b/src/paimon/core/table/source/data_split_test.cpp index 8592399c..d27b20a7 100644 --- a/src/paimon/core/table/source/data_split_test.cpp +++ b/src/paimon/core/table/source/data_split_test.cpp @@ -54,9 +54,8 @@ TEST(DataSplitTest, TestDeserializeVersion8WithWriteColsAndExternalPath) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( "data-72b62a5f-d698-4db5-b51a-04c0dc027702-0.orc", /*file_size=*/961, /*row_count=*/5, @@ -100,8 +99,7 @@ TEST(DataSplitTest, TestDeserializeVersion8WithWriteColsAndExternalPath) { .Build() .value()); ASSERT_EQ(*result_data_split, *expected_data_split) << result_data_split->ToString(); - ASSERT_OK_AND_ASSIGN(std::string serialize_bytes, - DataSplit::Serialize(result_data_split, pool)); + ASSERT_OK_AND_ASSIGN(std::string serialize_bytes, Split::Serialize(result_data_split, pool)); ASSERT_EQ(serialize_bytes, std::string(split_bytes.data(), split_bytes.size())); } @@ -119,9 +117,8 @@ TEST(DataSplitTest, TestDeserializeVersion8WithWriteCols) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), 
split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( "data-aa87291d-2a90-4846-b106-1bb4c76d74db-0.orc", /*file_size=*/961, /*row_count=*/5, @@ -164,8 +161,7 @@ TEST(DataSplitTest, TestDeserializeVersion8WithWriteCols) { .Build() .value()); ASSERT_EQ(*result_data_split, *expected_data_split) << result_data_split->ToString(); - ASSERT_OK_AND_ASSIGN(std::string serialize_bytes, - DataSplit::Serialize(result_data_split, pool)); + ASSERT_OK_AND_ASSIGN(std::string serialize_bytes, Split::Serialize(result_data_split, pool)); ASSERT_EQ(serialize_bytes, std::string(split_bytes.data(), split_bytes.size())); } @@ -184,9 +180,8 @@ TEST(DataSplitTest, TestDeserializeVersion7WithFirstRowId) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( "data-92480a4b-ec0b-4585-883a-a99679870c4d-0.orc", /*file_size=*/653, /*row_count=*/5, @@ -218,7 +213,7 @@ TEST(DataSplitTest, TestDeserializeVersion7WithFirstRowId) { .Build() .value()); ASSERT_EQ(*result_data_split, *expected_data_split); - ASSERT_OK(DataSplit::Serialize(result_data_split, pool)); + ASSERT_OK(Split::Serialize(result_data_split, pool)); } TEST(DataSplitTest, TestDeserializeVersion7WithNullFirstRowId) { @@ -235,9 +230,8 @@ TEST(DataSplitTest, TestDeserializeVersion7WithNullFirstRowId) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + 
ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( "data-16bd83f7-282a-479a-9968-0868436516b0-0.orc", /*file_size=*/567, /*row_count=*/1, @@ -268,7 +262,7 @@ TEST(DataSplitTest, TestDeserializeVersion7WithNullFirstRowId) { .Build() .value()); ASSERT_EQ(*result_data_split, *expected_data_split); - ASSERT_OK(DataSplit::Serialize(result_data_split, pool)); + ASSERT_OK(Split::Serialize(result_data_split, pool)); } TEST(DataSplitTest, TestDeserializeVersion6PkWithTotalBuckets) { @@ -284,9 +278,8 @@ TEST(DataSplitTest, TestDeserializeVersion6PkWithTotalBuckets) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( "data-d7725088-6bd4-4e70-9ce6-714ae93b47cc-0.orc", /*file_size=*/863, /*row_count=*/1, @@ -317,7 +310,7 @@ TEST(DataSplitTest, TestDeserializeVersion6PkWithTotalBuckets) { .Build() .value()); ASSERT_EQ(*result_data_split, *expected_data_split); - ASSERT_OK(DataSplit::Serialize(result_data_split, pool)); + ASSERT_OK(Split::Serialize(result_data_split, pool)); } TEST(DataSplitTest, TestDeserializeVersion5PkWithExternalPath) { @@ -332,9 +325,8 @@ TEST(DataSplitTest, TestDeserializeVersion5PkWithExternalPath) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = 
std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( "data-80110e15-97b5-4bcf-ac09-6ca2659a4950-0.orc", /*file_size=*/645, /*row_count=*/5, @@ -366,7 +358,7 @@ TEST(DataSplitTest, TestDeserializeVersion5PkWithExternalPath) { /*creation_time=*/Timestamp(1737111915429ll, 0), /*delete_row_count=*/0})}; ASSERT_EQ(file_list, result_data_split->GetFileList()); - ASSERT_OK(DataSplit::Serialize(result_data_split, pool)); + ASSERT_OK(Split::Serialize(result_data_split, pool)); } TEST(DataSplitTest, TestDeserializeVersion5PkWithEmptyExternalPath) { @@ -381,9 +373,8 @@ TEST(DataSplitTest, TestDeserializeVersion5PkWithEmptyExternalPath) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( "data-64d93fc3-eaf2-4253-9cff-a9faa701e207-0.orc", /*file_size=*/645, /*row_count=*/5, @@ -408,7 +399,7 @@ TEST(DataSplitTest, TestDeserializeVersion5PkWithEmptyExternalPath) { ASSERT_EQ(*result_data_split, *expected_data_split); ASSERT_EQ(5, expected_data_split->PartialMergedRowCount()); - ASSERT_OK(DataSplit::Serialize(result_data_split, pool)); + ASSERT_OK(Split::Serialize(result_data_split, pool)); } TEST(DataSplitTest, TestDeserializeVersion4PkWithSnapshot4WithDvCardinality) { @@ -424,9 +415,8 @@ TEST(DataSplitTest, TestDeserializeVersion4PkWithSnapshot4WithDvCardinality) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = 
std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( "data-2ffe7ae9-2cf7-41e9-944b-2065585cde31-0.orc", /*file_size=*/1318, /*row_count=*/7, @@ -479,9 +469,8 @@ TEST(DataSplitTest, TestDeserializeVersion3AppendWithSnapshot1) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( @@ -519,9 +508,8 @@ TEST(DataSplitTest, TestDeserializeVersion3AppendWithSnapshot1WithStatsDenseStor ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( @@ -557,9 +545,8 @@ TEST(DataSplitTest, TestDeserializeAppendWithSnapshot1) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( @@ -596,9 +583,8 @@ TEST(DataSplitTest, TestDeserializeAppendWithSnapshot3) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + 
Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta1 = std::make_shared( @@ -667,9 +653,8 @@ TEST(DataSplitTest, TestDeserializeAppendWithSnapshot5) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( @@ -707,9 +692,8 @@ TEST(DataSplitTest, TestDeserializePkWithSnapshot2) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( @@ -758,9 +742,8 @@ TEST(DataSplitTest, TestDeserializePkWithSnapshot6OfSingleFile) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( @@ -807,9 +790,8 @@ TEST(DataSplitTest, TestDeserializePkWithSnapshot6OfMultiFiles) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + 
Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta1 = std::make_shared( @@ -901,9 +883,8 @@ TEST(DataSplitTest, TestDeserializePkWithSnapshot8) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta = std::make_shared( @@ -951,9 +932,8 @@ TEST(DataSplitTest, TestDeserializePk10WithSnapshot6) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto result_data_split = std::dynamic_pointer_cast(result); auto file_meta1 = std::make_shared( diff --git a/src/paimon/core/table/source/data_table_batch_scan.cpp b/src/paimon/core/table/source/data_table_batch_scan.cpp index cc13202e..80e1eb58 100644 --- a/src/paimon/core/table/source/data_table_batch_scan.cpp +++ b/src/paimon/core/table/source/data_table_batch_scan.cpp @@ -70,8 +70,8 @@ Result> DataTableBatchScan::ApplyPushDownLimit( if (push_down_limit_ == std::nullopt) { return current_scan_result->GetPlan(); } - std::vector> splits = current_scan_result->Splits(); - std::vector> limited_data_splits; + std::vector> splits = current_scan_result->Splits(); + std::vector> limited_data_splits; limited_data_splits.reserve(splits.size()); int64_t scanned_row_count = 0; for (const auto& split : splits) { diff --git a/src/paimon/core/table/source/fallback_data_split_test.cpp b/src/paimon/core/table/source/fallback_data_split_test.cpp index 
18921d67..b2977b96 100644 --- a/src/paimon/core/table/source/fallback_data_split_test.cpp +++ b/src/paimon/core/table/source/fallback_data_split_test.cpp @@ -50,9 +50,8 @@ TEST(FallbackDataSplitTest, TestDeserialize) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto fallback_data_split = std::dynamic_pointer_cast(result); ASSERT_TRUE(fallback_data_split); // not fallback branch @@ -120,9 +119,8 @@ TEST(FallbackDataSplitTest, TestDeserialize2) { ASSERT_OK(input_stream->Close()); auto pool = GetDefaultPool(); - ASSERT_OK_AND_ASSIGN( - std::shared_ptr result, - DataSplit::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); + ASSERT_OK_AND_ASSIGN(std::shared_ptr result, + Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), pool)); auto fallback_data_split = std::dynamic_pointer_cast(result); ASSERT_TRUE(fallback_data_split); // is fallback branch diff --git a/src/paimon/core/table/source/fallback_table_read.cpp b/src/paimon/core/table/source/fallback_table_read.cpp index d0420e93..cc9907d9 100644 --- a/src/paimon/core/table/source/fallback_table_read.cpp +++ b/src/paimon/core/table/source/fallback_table_read.cpp @@ -23,7 +23,7 @@ namespace paimon { Result> FallbackTableRead::CreateReader( - const std::shared_ptr& split) { + const std::shared_ptr& split) { auto fallback_data_split = std::dynamic_pointer_cast(split); if (fallback_data_split) { if (fallback_data_split->IsFallback()) { diff --git a/src/paimon/core/table/source/fallback_table_read.h b/src/paimon/core/table/source/fallback_table_read.h index 289d33d4..98b404a7 100644 --- a/src/paimon/core/table/source/fallback_table_read.h +++ b/src/paimon/core/table/source/fallback_table_read.h @@ -36,8 +36,7 @@ class 
FallbackTableRead : public TableRead { main_table_(std::move(main_table)), fallback_table_(std::move(fallback_table)) {} - Result> CreateReader( - const std::shared_ptr& data_split) override; + Result> CreateReader(const std::shared_ptr& split) override; private: std::unique_ptr main_table_; diff --git a/src/paimon/core/table/source/key_value_table_read.cpp b/src/paimon/core/table/source/key_value_table_read.cpp index 523d199f..75633118 100644 --- a/src/paimon/core/table/source/key_value_table_read.cpp +++ b/src/paimon/core/table/source/key_value_table_read.cpp @@ -50,7 +50,11 @@ Result> KeyValueTableRead::Create( } Result> KeyValueTableRead::CreateReader( - const std::shared_ptr& data_split) { + const std::shared_ptr& split) { + auto data_split = std::dynamic_pointer_cast(split); + if (!data_split) { + return Status::Invalid("split cannot be casted to DataSplit"); + } for (const auto& read : split_reads_) { PAIMON_ASSIGN_OR_RAISE(bool matched, read->Match(data_split, force_keep_delete_)); if (matched) { diff --git a/src/paimon/core/table/source/key_value_table_read.h b/src/paimon/core/table/source/key_value_table_read.h index 6999b9e1..d72a8dc9 100644 --- a/src/paimon/core/table/source/key_value_table_read.h +++ b/src/paimon/core/table/source/key_value_table_read.h @@ -27,7 +27,7 @@ #include "paimon/table/source/table_read.h" namespace paimon { -class DataSplit; +class Split; class Executor; class FileStorePathFactory; class InternalReadContext; @@ -40,8 +40,7 @@ class KeyValueTableRead : public TableRead { const std::shared_ptr& context, const std::shared_ptr& memory_pool, const std::shared_ptr& executor); - Result> CreateReader( - const std::shared_ptr& data_split) override; + Result> CreateReader(const std::shared_ptr& split) override; private: KeyValueTableRead(std::vector>&& split_reads, diff --git a/src/paimon/core/table/source/plan_impl.cpp b/src/paimon/core/table/source/plan_impl.cpp index c20ef04d..34a3acab 100644 --- 
a/src/paimon/core/table/source/plan_impl.cpp +++ b/src/paimon/core/table/source/plan_impl.cpp @@ -17,11 +17,10 @@ #include "paimon/core/table/source/plan_impl.h" namespace paimon { -class DataSplit; const std::shared_ptr PlanImpl::EmptyPlan() { - static const std::shared_ptr empty_plan = std::make_shared( - std::optional(), std::vector>()); + static const std::shared_ptr empty_plan = + std::make_shared(std::optional(), std::vector>()); return empty_plan; } diff --git a/src/paimon/core/table/source/plan_impl.h b/src/paimon/core/table/source/plan_impl.h index 461fd4cc..8a87bc9e 100644 --- a/src/paimon/core/table/source/plan_impl.h +++ b/src/paimon/core/table/source/plan_impl.h @@ -24,20 +24,19 @@ #include "paimon/table/source/plan.h" namespace paimon { -class DataSplit; /// An implementation of `Plan`. class PlanImpl : public Plan { public: PlanImpl(const std::optional& snapshot_id, - const std::vector>& splits) + const std::vector>& splits) : snapshot_id_(snapshot_id), splits_(splits) {} std::optional SnapshotId() const override { return snapshot_id_; } - const std::vector>& Splits() const override { + const std::vector>& Splits() const override { return splits_; } @@ -45,6 +44,6 @@ class PlanImpl : public Plan { private: std::optional snapshot_id_; - std::vector> splits_; + std::vector> splits_; }; } // namespace paimon diff --git a/src/paimon/core/table/source/snapshot/snapshot_reader.cpp b/src/paimon/core/table/source/snapshot/snapshot_reader.cpp index ac26e779..fac299ae 100644 --- a/src/paimon/core/table/source/snapshot/snapshot_reader.cpp +++ b/src/paimon/core/table/source/snapshot/snapshot_reader.cpp @@ -42,16 +42,16 @@ Result> SnapshotReader::Read() const { FileStoreScan::RawPlan::GroupFiles files = FileStoreScan::RawPlan::GroupByPartFiles(raw_plan->Files(FileKind::Add())); PAIMON_ASSIGN_OR_RAISE( - std::vector> data_splits, + std::vector> data_splits, GenerateSplits(snapshot, scan_mode_ != ScanMode::ALL, split_generator_, std::move(files))); return 
std::make_shared(raw_plan->SnapshotId(), data_splits); } -Result>> SnapshotReader::GenerateSplits( +Result>> SnapshotReader::GenerateSplits( const std::optional& snapshot, bool is_streaming, const std::unique_ptr& split_generator, FileStoreScan::RawPlan::GroupFiles&& grouped_manifest_entries) const { - std::vector> splits; + std::vector> splits; // Read deletion indexes at once to reduce file IO std::unordered_map, std::vector>> deletion_index_files_map; diff --git a/src/paimon/core/table/source/snapshot/snapshot_reader.h b/src/paimon/core/table/source/snapshot/snapshot_reader.h index 16d5d4ee..03a7ebca 100644 --- a/src/paimon/core/table/source/snapshot/snapshot_reader.h +++ b/src/paimon/core/table/source/snapshot/snapshot_reader.h @@ -88,7 +88,7 @@ class SnapshotReader { Result> Read() const; private: - Result>> GenerateSplits( + Result>> GenerateSplits( const std::optional& snapshot, bool is_streaming, const std::unique_ptr& split_generator, FileStoreScan::RawPlan::GroupFiles&& grouped_data_files) const; diff --git a/src/paimon/core/table/source/snapshot/starting_scanner.h b/src/paimon/core/table/source/snapshot/starting_scanner.h index e28414ae..41bc8497 100644 --- a/src/paimon/core/table/source/snapshot/starting_scanner.h +++ b/src/paimon/core/table/source/snapshot/starting_scanner.h @@ -42,7 +42,7 @@ class StartingScanner { return plan_->SnapshotId().value(); } - const std::vector>& Splits() const { + const std::vector>& Splits() const { return plan_->Splits(); } diff --git a/src/paimon/core/table/source/split.cpp b/src/paimon/core/table/source/split.cpp new file mode 100644 index 00000000..2d188cd5 --- /dev/null +++ b/src/paimon/core/table/source/split.cpp @@ -0,0 +1,238 @@ +/* + * Copyright 2025-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "fmt/format.h" +#include "paimon/common/data/binary_row.h" +#include "paimon/common/io/memory_segment_output_stream.h" +#include "paimon/common/memory/memory_segment_utils.h" +#include "paimon/common/utils/serialization_utils.h" +#include "paimon/core/global_index/indexed_split_impl.h" +#include "paimon/core/io/data_file_meta_serializer.h" +#include "paimon/core/table/source/data_split_impl.h" +#include "paimon/core/table/source/deletion_file.h" +#include "paimon/core/table/source/fallback_data_split.h" +#include "paimon/core/utils/object_serializer.h" +#include "paimon/global_index/indexed_split.h" +#include "paimon/io/byte_array_input_stream.h" +#include "paimon/io/data_input_stream.h" +#include "paimon/memory/bytes.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/status.h" +#include "paimon/table/source/data_split.h" +namespace paimon { +struct DataFileMeta; +namespace { +Status WriteDataSplit(const std::shared_ptr& data_split_impl, + MemorySegmentOutputStream* out, const std::shared_ptr& pool) { + out->WriteValue(DataSplitImpl::MAGIC); + out->WriteValue(DataSplitImpl::VERSION); + out->WriteValue(data_split_impl->SnapshotId()); + + PAIMON_RETURN_NOT_OK(SerializationUtils::SerializeBinaryRow(data_split_impl->Partition(), out)); + out->WriteValue(data_split_impl->Bucket()); + out->WriteString(data_split_impl->BucketPath()); + + std::optional total_buckets = data_split_impl->TotalBuckets(); + if (total_buckets == std::nullopt) { + out->WriteValue(false); + } else { + out->WriteValue(true); + 
out->WriteValue(total_buckets.value()); + } + + DataFileMetaSerializer serializer(pool); + PAIMON_RETURN_NOT_OK(serializer.SerializeList(data_split_impl->BeforeFiles(), out)); + + DeletionFile::SerializeList(data_split_impl->BeforeDeletionFiles(), out); + PAIMON_RETURN_NOT_OK(serializer.SerializeList(data_split_impl->DataFiles(), out)); + DeletionFile::SerializeList(data_split_impl->DeletionFiles(), out); + out->WriteValue(data_split_impl->IsStreaming()); + out->WriteValue(data_split_impl->RawConvertible()); + return Status::OK(); +} + +Result> ReadDataSplitWithoutMagicNumber( + int64_t magic, DataInputStream* in, const std::shared_ptr& pool) { + int32_t version = 1; + if (magic == DataSplitImpl::MAGIC) { + PAIMON_ASSIGN_OR_RAISE(version, in->ReadValue()); + } + + // version 1 does not write magic number in, so the first long is snapshot id. + int64_t snapshot_id = magic; + if (version != 1) { + PAIMON_ASSIGN_OR_RAISE(snapshot_id, in->ReadValue()); + } + + PAIMON_ASSIGN_OR_RAISE(BinaryRow partition, + SerializationUtils::DeserializeBinaryRow(in, pool.get())); + int32_t bucket = -1; + PAIMON_ASSIGN_OR_RAISE(bucket, in->ReadValue()); + std::string bucket_path; + PAIMON_ASSIGN_OR_RAISE(bucket_path, in->ReadString()); + + std::optional total_buckets; + if (version >= 6) { + PAIMON_ASSIGN_OR_RAISE(bool total_buckets_exist, in->ReadValue()); + if (total_buckets_exist) { + PAIMON_ASSIGN_OR_RAISE(total_buckets, in->ReadValue()); + } + } + + PAIMON_ASSIGN_OR_RAISE( + std::unique_ptr>> data_file_serializer, + DataSplitImpl::GetFileMetaSerializer(version, pool)); + std::vector> before_files; + PAIMON_ASSIGN_OR_RAISE(before_files, data_file_serializer->DeserializeList(in)); + // compatible for deletion file + std::vector> before_deletion_files; + PAIMON_ASSIGN_OR_RAISE(before_deletion_files, DeletionFile::DeserializeList(in, version)); + + std::vector> data_files; + PAIMON_ASSIGN_OR_RAISE(data_files, data_file_serializer->DeserializeList(in)); + // compatible for deletion file 
+ std::vector> data_deletion_files; + PAIMON_ASSIGN_OR_RAISE(data_deletion_files, DeletionFile::DeserializeList(in, version)); + + bool is_streaming = false; + PAIMON_ASSIGN_OR_RAISE(is_streaming, in->ReadValue()); + bool raw_convertible = false; + PAIMON_ASSIGN_OR_RAISE(raw_convertible, in->ReadValue()); + + DataSplitImpl::Builder builder(partition, bucket, bucket_path, std::move(data_files)); + builder.WithTotalBuckets(total_buckets) + .WithSnapshot(snapshot_id) + .WithBeforeFiles(std::move(before_files)) + .IsStreaming(is_streaming) + .RawConvertible(raw_convertible); + if (!before_deletion_files.empty()) { + builder.WithBeforeDeletionFiles(before_deletion_files); + } + if (!data_deletion_files.empty()) { + builder.WithDataDeletionFiles(data_deletion_files); + } + return builder.Build(); +} + +} // namespace + +Result Split::Serialize(const std::shared_ptr& split, + const std::shared_ptr& pool) { + MemorySegmentOutputStream out(MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool); + if (auto data_split_impl = std::dynamic_pointer_cast(split)) { + PAIMON_RETURN_NOT_OK(WriteDataSplit(data_split_impl, &out, pool)); + } else if (auto indexed_split_impl = std::dynamic_pointer_cast(split)) { + out.WriteValue(IndexedSplitImpl::MAGIC); + out.WriteValue(IndexedSplitImpl::VERSION); + auto inner_split_impl = + std::dynamic_pointer_cast(indexed_split_impl->GetDataSplit()); + if (!inner_split_impl) { + return Status::Invalid("inner split in IndexedSplit is supposed to be DataSplit"); + } + PAIMON_RETURN_NOT_OK(WriteDataSplit(inner_split_impl, &out, pool)); + auto row_ranges = indexed_split_impl->RowRanges(); + out.WriteValue(row_ranges.size()); + for (const auto& range : row_ranges) { + out.WriteValue(range.from); + out.WriteValue(range.to); + } + + auto scores = indexed_split_impl->Scores(); + if (!scores.empty()) { + out.WriteValue(true); + out.WriteValue(scores.size()); + for (const auto& score : scores) { + out.WriteValue(score); + } + } else { + out.WriteValue(false); 
+ } + } else { + return Status::Invalid("invalid split, cannot cast to DataSplit or IndexedSplit"); + } + PAIMON_UNIQUE_PTR bytes = + MemorySegmentUtils::CopyToBytes(out.Segments(), 0, out.CurrentSize(), pool.get()); + return std::string(bytes->data(), bytes->size()); +} + +Result> Split::Deserialize(const char* buffer, size_t length, + const std::shared_ptr& pool) { + auto input_stream = std::make_shared(buffer, length); + DataInputStream in(input_stream); + + int64_t magic = -1; + PAIMON_ASSIGN_OR_RAISE(magic, in.ReadValue()); + + if (magic == IndexedSplitImpl::MAGIC) { + PAIMON_ASSIGN_OR_RAISE(int32_t version, in.ReadValue()); + if (version != IndexedSplitImpl::VERSION) { + return Status::Invalid(fmt::format("Unsupported IndexedSplit version: {}", version)); + } + PAIMON_ASSIGN_OR_RAISE(int64_t data_split_magic, in.ReadValue()); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_split, + ReadDataSplitWithoutMagicNumber(data_split_magic, &in, pool)); + PAIMON_ASSIGN_OR_RAISE(int32_t range_size, in.ReadValue()); + std::vector row_ranges; + row_ranges.reserve(range_size); + for (int32_t i = 0; i < range_size; ++i) { + PAIMON_ASSIGN_OR_RAISE(int64_t range_from, in.ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int64_t range_to, in.ReadValue()); + row_ranges.emplace_back(range_from, range_to); + } + std::vector scores; + PAIMON_ASSIGN_OR_RAISE(bool has_scores, in.ReadValue()); + if (has_scores) { + PAIMON_ASSIGN_OR_RAISE(int32_t scores_length, in.ReadValue()); + scores.resize(scores_length); + for (int32_t i = 0; i < scores_length; ++i) { + PAIMON_ASSIGN_OR_RAISE(float score, in.ReadValue()); + scores[i] = score; + } + } + // TODO(lisizhuo.lsz): support fallback split in IndexedSplit + PAIMON_ASSIGN_OR_RAISE(int64_t pos, in.GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t stream_length, in.Length()); + if (pos == stream_length) { + return std::make_shared(data_split, row_ranges, scores); + } else if (pos == stream_length - 1) { + return Status::Invalid( + "invalid IndexedSplit, do not 
support FallbackSplit in IndexedSplit"); + } else { + return Status::Invalid( + fmt::format("invalid IndexedSplit, remaining {} bytes after deserializing", + stream_length - pos)); + } + } else if (magic == DataSplitImpl::MAGIC) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_split, + ReadDataSplitWithoutMagicNumber(magic, &in, pool)); + PAIMON_ASSIGN_OR_RAISE(int64_t pos, in.GetPos()); + PAIMON_ASSIGN_OR_RAISE(int64_t stream_length, in.Length()); + if (pos == stream_length) { + return data_split; + } else if (pos == stream_length - 1) { + PAIMON_ASSIGN_OR_RAISE(bool is_fallback, in.ReadValue()); + return std::make_shared(data_split, is_fallback); + } else { + return Status::Invalid(fmt::format( + "invalid data split byte stream, remaining {} bytes after deserializing", + stream_length - pos)); + } + } + return Status::Invalid("invalid split, must be DataSplit or IndexedSplit"); +} +} // namespace paimon diff --git a/src/paimon/core/table/source/table_read.cpp b/src/paimon/core/table/source/table_read.cpp index df956f1e..439b6a06 100644 --- a/src/paimon/core/table/source/table_read.cpp +++ b/src/paimon/core/table/source/table_read.cpp @@ -144,10 +144,10 @@ Result> TableRead::Create(std::unique_ptr> TableRead::CreateReader( - const std::vector>& data_splits) { + const std::vector>& splits) { std::vector> batch_readers; - batch_readers.reserve(data_splits.size()); - for (const auto& split : data_splits) { + batch_readers.reserve(splits.size()); + for (const auto& split : splits) { PAIMON_ASSIGN_OR_RAISE(std::unique_ptr reader, CreateReader(split)); batch_readers.emplace_back(std::move(reader)); } diff --git a/src/paimon/testing/mock/mock_file_batch_reader.h b/src/paimon/testing/mock/mock_file_batch_reader.h index efcac8a1..6a923a29 100644 --- a/src/paimon/testing/mock/mock_file_batch_reader.h +++ b/src/paimon/testing/mock/mock_file_batch_reader.h @@ -115,6 +115,9 @@ class MockFileBatchReader : public FileBatchReader { } int32_t batch_end_pos = std::min(read_end_pos_, 
current_pos_ + actual_batch_size); auto slice = data_->Slice(current_pos_, batch_end_pos - current_pos_); + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW( + std::shared_ptr concat_slice, + arrow::Concatenate({slice}, arrow::default_memory_pool())); RoaringBitmap32 bitmap; for (auto iter = bitmap_.EqualOrLarger(current_pos_); iter != bitmap_.End() && *iter < batch_end_pos; ++iter) { @@ -128,7 +131,7 @@ class MockFileBatchReader : public FileBatchReader { std::unique_ptr c_array = std::make_unique(); std::unique_ptr c_schema = std::make_unique(); PAIMON_RETURN_NOT_OK_FROM_ARROW( - arrow::ExportArray(*slice, c_array.get(), c_schema.get())); + arrow::ExportArray(*concat_slice, c_array.get(), c_schema.get())); return std::make_pair(std::make_pair(std::move(c_array), std::move(c_schema)), std::move(bitmap)); } diff --git a/src/paimon/testing/utils/test_helper.h b/src/paimon/testing/utils/test_helper.h index 96ddc020..36a4de96 100644 --- a/src/paimon/testing/utils/test_helper.h +++ b/src/paimon/testing/utils/test_helper.h @@ -141,9 +141,9 @@ class TestHelper { return commit_messages; } - Result>> NewScan(StartupMode startup_mode, - std::optional snapshot_id, - bool is_streaming = true) { + Result>> NewScan(StartupMode startup_mode, + std::optional snapshot_id, + bool is_streaming = true) { ScanContextBuilder scan_context_builder(table_path_); scan_context_builder.WithStreamingMode(is_streaming) .SetOptions(options_) @@ -157,7 +157,7 @@ class TestHelper { return Scan(); } - Result>> Scan() { + Result>> Scan() { if (scan_ == nullptr) { return Status::Invalid("need call NewScan first"); } @@ -246,8 +246,7 @@ class TestHelper { Result ReadAndCheckResultForBlobTable( const std::shared_ptr& all_columns_schema, - const std::vector>& splits, - const std::string& main_expected_json, + const std::vector>& splits, const std::string& main_expected_json, const std::vector>& expected_blob_descriptors) { ReadContextBuilder read_context_builder(table_path_); 
read_context_builder.SetOptions(options_); @@ -302,7 +301,7 @@ class TestHelper { } Result ReadAndCheckResult(const std::shared_ptr& data_type, - const std::vector>& splits, + const std::vector>& splits, const std::string& expected_result) { ReadContextBuilder read_context_builder(table_path_); read_context_builder.SetOptions(options_); diff --git a/test/inte/blob_table_inte_test.cpp b/test/inte/blob_table_inte_test.cpp index df6eaef8..bd527e97 100644 --- a/test/inte/blob_table_inte_test.cpp +++ b/test/inte/blob_table_inte_test.cpp @@ -40,9 +40,11 @@ #include "paimon/common/table/special_fields.h" #include "paimon/common/utils/path_util.h" #include "paimon/common/utils/scope_guard.h" +#include "paimon/core/global_index/indexed_split_impl.h" #include "paimon/core/schema/schema_manager.h" #include "paimon/core/snapshot.h" #include "paimon/core/stats/simple_stats.h" +#include "paimon/core/table/source/data_split_impl.h" #include "paimon/core/utils/file_utils.h" #include "paimon/core/utils/snapshot_manager.h" #include "paimon/data/blob.h" @@ -156,6 +158,36 @@ class BlobTableInteTest : public testing::Test, public ::testing::WithParamInter return file_store_commit->Commit(commit_msgs); } + Result>> CreateReadSplit( + const std::vector>& splits, + const std::vector& row_ranges) const { + if (row_ranges.empty()) { + return splits; + } + // TODO(xinyu.lxy): mv to DataEvolutionBatchScan + std::vector sorted_row_ranges = + Range::SortAndMergeOverlap(row_ranges, /*adjacent=*/true); + std::vector> indexed_splits; + indexed_splits.reserve(splits.size()); + for (const auto& split : splits) { + auto data_split = std::dynamic_pointer_cast(split); + if (!data_split) { + return Status::Invalid("Cannot cast split to DataSplit when create IndexedSplit"); + } + std::vector file_ranges; + file_ranges.reserve(data_split->DataFiles().size()); + for (const auto& meta : data_split->DataFiles()) { + PAIMON_ASSIGN_OR_RAISE(int64_t first_row_id, meta->NonNullFirstRowId()); + 
file_ranges.emplace_back(first_row_id, first_row_id + meta->row_count - 1); + } + auto sorted_file_ranges = Range::SortAndMergeOverlap(file_ranges, /*adjacent=*/true); + std::vector expected = Range::And(sorted_file_ranges, sorted_row_ranges); + // TODO(xinyu.lxy): add scores + indexed_splits.push_back(std::make_shared(data_split, expected)); + } + return indexed_splits; + } + Status ScanAndRead(const std::string& table_path, const std::vector& read_schema, const std::shared_ptr& expected_array, const std::shared_ptr& predicate = nullptr, @@ -173,13 +205,12 @@ class BlobTableInteTest : public testing::Test, public ::testing::WithParamInter // read auto splits = result_plan->Splits(); ReadContextBuilder read_context_builder(table_path); - read_context_builder.SetReadSchema(read_schema) - .SetPredicate(predicate) - .SetRowRanges(row_ranges); + read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_context, read_context_builder.Finish()); PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context))); - PAIMON_ASSIGN_OR_RAISE(auto batch_reader, table_read->CreateReader(splits)); + PAIMON_ASSIGN_OR_RAISE(auto read_splits, CreateReadSplit(splits, row_ranges)); + PAIMON_ASSIGN_OR_RAISE(auto batch_reader, table_read->CreateReader(read_splits)); PAIMON_ASSIGN_OR_RAISE(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); @@ -326,7 +357,7 @@ TEST_P(BlobTableInteTest, TestAppendTableWriteWithBlobAsDescriptorTrue) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto schema_with_row_kind = arrow::schema(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ [0, "str_0", null], @@ -377,7 +408,7 @@ TEST_P(BlobTableInteTest, 
TestAppendTableWriteWithBlobAsDescriptorFalse) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ [0, "str_0", null, "apple"], diff --git a/test/inte/data_evolution_table_test.cpp b/test/inte/data_evolution_table_test.cpp index 32155bc2..2d00c4bc 100644 --- a/test/inte/data_evolution_table_test.cpp +++ b/test/inte/data_evolution_table_test.cpp @@ -20,9 +20,11 @@ #include "paimon/common/utils/date_time_utils.h" #include "paimon/common/utils/path_util.h" #include "paimon/common/utils/scope_guard.h" +#include "paimon/core/global_index/indexed_split_impl.h" #include "paimon/core/table/source/data_split_impl.h" #include "paimon/defs.h" #include "paimon/fs/file_system.h" +#include "paimon/global_index/indexed_split.h" #include "paimon/predicate/literal.h" #include "paimon/predicate/predicate_builder.h" #include "paimon/result.h" @@ -120,6 +122,36 @@ class DataEvolutionTableTest : public ::testing::Test, return file_store_commit->Commit(commit_msgs); } + Result>> CreateReadSplit( + const std::vector>& splits, + const std::vector& row_ranges) const { + if (row_ranges.empty()) { + return splits; + } + // TODO(xinyu.lxy): mv to DataEvolutionBatchScan + std::vector sorted_row_ranges = + Range::SortAndMergeOverlap(row_ranges, /*adjacent=*/true); + std::vector> indexed_splits; + indexed_splits.reserve(splits.size()); + for (const auto& split : splits) { + auto data_split = std::dynamic_pointer_cast(split); + if (!data_split) { + return Status::Invalid("Cannot cast split to DataSplit when create IndexedSplit"); + } + std::vector file_ranges; + file_ranges.reserve(data_split->DataFiles().size()); + for (const auto& meta : data_split->DataFiles()) { + 
PAIMON_ASSIGN_OR_RAISE(int64_t first_row_id, meta->NonNullFirstRowId()); + file_ranges.emplace_back(first_row_id, first_row_id + meta->row_count - 1); + } + auto sorted_file_ranges = Range::SortAndMergeOverlap(file_ranges, /*adjacent=*/true); + std::vector expected = Range::And(sorted_file_ranges, sorted_row_ranges); + // TODO(xinyu.lxy): add scores + indexed_splits.push_back(std::make_shared(data_split, expected)); + } + return indexed_splits; + } + Status ScanAndRead(const std::string& table_path, const std::vector& read_schema, const std::shared_ptr& expected_array, const std::shared_ptr& predicate = nullptr, @@ -140,13 +172,12 @@ class DataEvolutionTableTest : public ::testing::Test, // read auto splits = result_plan->Splits(); ReadContextBuilder read_context_builder(table_path); - read_context_builder.SetReadSchema(read_schema) - .SetPredicate(predicate) - .SetRowRanges(row_ranges); + read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_context, read_context_builder.Finish()); PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context))); - PAIMON_ASSIGN_OR_RAISE(auto batch_reader, table_read->CreateReader(splits)); + PAIMON_ASSIGN_OR_RAISE(auto read_splits, CreateReadSplit(splits, row_ranges)); + PAIMON_ASSIGN_OR_RAISE(auto batch_reader, table_read->CreateReader(read_splits)); PAIMON_ASSIGN_OR_RAISE(auto read_result, ReadResultCollector::CollectResult(batch_reader.get())); @@ -285,6 +316,13 @@ TEST_P(DataEvolutionTableTest, TestBasic) { ASSERT_OK(ScanAndRead(table_path, {"f1", "f0", "_SEQUENCE_NUMBER", "_ROW_ID", "f2"}, expected_row_tracking_array)); + + // read score but not indexed split + ASSERT_NOK_WITH_MSG( + ScanAndRead(table_path, {"f0", "f1", "_INDEX_SCORE"}, expected_row_tracking_array, + /*predicate=*/nullptr, + /*row_ranges=*/{}), + "Invalid read schema, read _INDEX_SCORE while split cannot cast to IndexedSplit"); } } @@ -713,18 +751,17 @@ 
TEST_P(DataEvolutionTableTest, TestOnlyRowTrackingEnabled) { ASSERT_OK(Commit(table_path, commit_msgs)); { - // test with row ids + // test with row ids, as only data evolution mode support read with row ranges std::vector row_ranges = {Range(1l, 1l)}; - CheckScanResult(table_path, /*predicate=*/nullptr, /*row_ranges=*/row_ranges, - /*expected_first_row_ids=*/{0}, /*expected_row_counts=*/{2}); auto expected_array = std::dynamic_pointer_cast( arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ [2, "c", "d"] ])") .ValueOrDie()); - ASSERT_OK(ScanAndRead(table_path, schema->field_names(), expected_array, - /*predicate=*/nullptr, - /*row_ranges=*/row_ranges)); + ASSERT_NOK_WITH_MSG(ScanAndRead(table_path, schema->field_names(), expected_array, + /*predicate=*/nullptr, + /*row_ranges=*/row_ranges), + "unexpected error, split cast to impl failed"); } if (GetParam() != "lance") { // read with row tracking @@ -1461,21 +1498,28 @@ TEST_P(DataEvolutionTableTest, TestScanAndReadWithIndex) { } { // f2 has bitmap index, data evolution scan will ignore index => not empty plan - // but raw file split read will not ignore index => empty read batch + // data evolution split read will also ignore index => not empty read batch auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(203)); + auto expected_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow_data_type, R"([ + [null, null, 202, 6.1], + [null, null, 204, 7.1] + ])") + .ValueOrDie()); ASSERT_OK(ScanAndRead(table_path, arrow::schema(arrow_data_type->fields())->field_names(), - /*expected_array=*/nullptr, predicate, + expected_array, predicate, /*row_ranges=*/{}, - /*check_scan_plan_when_empty_result=*/false)); + /*check_scan_plan_when_empty_result=*/true)); } { - // f2 has bitmap index, raw file split read will not ignore index + // f2 has bitmap index, data evolution split read will ignore index auto predicate = 
PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(202)); auto expected_array = std::dynamic_pointer_cast( arrow::ipc::internal::json::ArrayFromJSON(arrow_data_type, R"([ - [null, null, 202, 6.1] + [null, null, 202, 6.1], + [null, null, 204, 7.1] ])") .ValueOrDie()); ASSERT_OK(ScanAndRead(table_path, arrow::schema(arrow_data_type->fields())->field_names(), @@ -1515,13 +1559,14 @@ TEST_P(DataEvolutionTableTest, TestScanAndReadWithIndex) { { // test row id with predicate std::vector row_ranges = {Range(4l, 5l)}; - // row id = {4, 5}, while raw file split read will skip row 4 for bitmap index + // row id = {4, 5}, data evolution split read will ignore bitmap index auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(204)); CheckScanResult(table_path, /*predicate=*/predicate, /*row_ranges=*/row_ranges, /*expected_first_row_ids=*/{4}, /*expected_row_counts=*/{2}); auto expected_array = std::dynamic_pointer_cast( arrow::ipc::internal::json::ArrayFromJSON(arrow_data_type, R"([ + [null, null, 202, 6.1], [null, null, 204, 7.1] ])") .ValueOrDie()); diff --git a/test/inte/global_index_test.cpp b/test/inte/global_index_test.cpp index 0a50f3ac..23adf2f0 100644 --- a/test/inte/global_index_test.cpp +++ b/test/inte/global_index_test.cpp @@ -17,7 +17,9 @@ #include "gtest/gtest.h" #include "paimon/common/factories/io_hook.h" #include "paimon/common/global_index/bitmap/bitmap_global_index_factory.h" +#include "paimon/common/table/special_fields.h" #include "paimon/common/utils/scope_guard.h" +#include "paimon/core/global_index/indexed_split_impl.h" #include "paimon/core/table/source/data_split_impl.h" #include "paimon/defs.h" #include "paimon/fs/file_system.h" @@ -109,7 +111,7 @@ class GlobalIndexTest : public ::testing::Test, public ::testing::WithParamInter return file_store_commit->Commit(commit_msgs); } - Result> ScanData( + Result> ScanData( const std::string& table_path, const 
std::vector>& partition_filters) const { ScanContextBuilder scan_context_builder(table_path); @@ -118,7 +120,89 @@ class GlobalIndexTest : public ::testing::Test, public ::testing::WithParamInter PAIMON_ASSIGN_OR_RAISE(auto table_scan, TableScan::Create(std::move(scan_context))); PAIMON_ASSIGN_OR_RAISE(auto result_plan, table_scan->CreatePlan()); EXPECT_EQ(result_plan->Splits().size(), 1); - return result_plan->Splits()[0]; + return std::dynamic_pointer_cast(result_plan->Splits()[0]); + } + + Result>> CreateReadSplit( + const std::vector>& splits, const std::vector& row_ranges, + std::map id_to_score) const { + if (row_ranges.empty()) { + return splits; + } + // TODO(xinyu.lxy): mv to DataEvolutionBatchScan + std::vector sorted_row_ranges = + Range::SortAndMergeOverlap(row_ranges, /*adjacent=*/true); + std::vector> indexed_splits; + indexed_splits.reserve(splits.size()); + for (const auto& split : splits) { + auto data_split = std::dynamic_pointer_cast(split); + if (!data_split) { + return Status::Invalid("Cannot cast split to DataSplit when create IndexedSplit"); + } + std::vector file_ranges; + file_ranges.reserve(data_split->DataFiles().size()); + for (const auto& meta : data_split->DataFiles()) { + PAIMON_ASSIGN_OR_RAISE(int64_t first_row_id, meta->NonNullFirstRowId()); + file_ranges.emplace_back(first_row_id, first_row_id + meta->row_count - 1); + } + auto sorted_file_ranges = Range::SortAndMergeOverlap(file_ranges, /*adjacent=*/true); + std::vector expected = Range::And(sorted_file_ranges, sorted_row_ranges); + std::vector scores; + if (!id_to_score.empty()) { + for (const auto& range : expected) { + for (int64_t i = range.from; i <= range.to; i++) { + scores.push_back(id_to_score[i]); + } + } + } + indexed_splits.push_back( + std::make_shared(data_split, expected, scores)); + } + return indexed_splits; + } + + Status ScanAndRead(const std::string& table_path, const std::vector& read_schema, + const std::shared_ptr& expected_array, + const std::shared_ptr& 
predicate, + const std::vector& row_ranges, + const std::map& id_to_score) const { + // scan + ScanContextBuilder scan_context_builder(table_path); + scan_context_builder.SetPredicate(predicate).SetRowRanges(row_ranges); + PAIMON_ASSIGN_OR_RAISE(auto scan_context, scan_context_builder.Finish()); + PAIMON_ASSIGN_OR_RAISE(auto table_scan, TableScan::Create(std::move(scan_context))); + PAIMON_ASSIGN_OR_RAISE(auto result_plan, table_scan->CreatePlan()); + if (!expected_array) { + if (!result_plan->Splits().empty()) { + return Status::Invalid("check_scan_plan_when_empty_result but plan is not empty"); + } + } + + // read + auto splits = result_plan->Splits(); + ReadContextBuilder read_context_builder(table_path); + read_context_builder.SetReadSchema(read_schema).SetPredicate(predicate); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr read_context, + read_context_builder.Finish()); + PAIMON_ASSIGN_OR_RAISE(auto table_read, TableRead::Create(std::move(read_context))); + PAIMON_ASSIGN_OR_RAISE(auto read_splits, CreateReadSplit(splits, row_ranges, id_to_score)); + PAIMON_ASSIGN_OR_RAISE(auto batch_reader, table_read->CreateReader(read_splits)); + PAIMON_ASSIGN_OR_RAISE(auto read_result, + ReadResultCollector::CollectResult(batch_reader.get())); + + if (!expected_array) { + if (read_result) { + return Status::Invalid("expected array is empty, but read result is not empty"); + } + return Status::OK(); + } + auto expected_chunk_array = std::make_shared(expected_array); + if (!expected_chunk_array->ApproxEquals(*read_result)) { + std::cout << "result=" << read_result->ToString() << std::endl + << "expected=" << expected_chunk_array->ToString() << std::endl; + return Status::Invalid("expected array and result array not equal"); + } + return Status::OK(); } private: @@ -168,7 +252,9 @@ TEST_P(GlobalIndexTest, TestWriteLuminaIndex) { ASSERT_OK_AND_ASSIGN(auto split, ScanData(table_path, /*partition_filters=*/{})); ASSERT_OK_AND_ASSIGN(auto index_commit_msg, 
RowRangeGlobalIndexWriter::WriteIndex( - table_path, "f1", "lumina", split, Range(0, 3), + table_path, "f1", "lumina", + std::make_shared( + split, std::vector({Range(0, 3)})), /*options=*/lumina_options, pool_)); auto index_commit_msg_impl = std::dynamic_pointer_cast(index_commit_msg); ASSERT_TRUE(index_commit_msg_impl); @@ -210,9 +296,11 @@ TEST_P(GlobalIndexTest, TestWriteIndex) { ASSERT_OK(Commit(table_path, commit_msgs)); ASSERT_OK_AND_ASSIGN(auto split, ScanData(table_path, /*partition_filters=*/{})); - ASSERT_OK_AND_ASSIGN(auto index_commit_msg, - RowRangeGlobalIndexWriter::WriteIndex(table_path, "f0", "bitmap", split, - Range(0, 7), /*options=*/{}, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_commit_msg, RowRangeGlobalIndexWriter::WriteIndex( + table_path, "f0", "bitmap", + std::make_shared( + split, std::vector({Range(0, 7)})), + /*options=*/{}, pool_)); auto index_commit_msg_impl = std::dynamic_pointer_cast(index_commit_msg); ASSERT_TRUE(index_commit_msg_impl); @@ -264,9 +352,12 @@ TEST_P(GlobalIndexTest, TestWriteIndexWithPartition) { [&](const std::vector>& partition, const Range& expected_range, const BinaryRow& expected_partition_row) { ASSERT_OK_AND_ASSIGN(auto split, ScanData(table_path, partition)); - ASSERT_OK_AND_ASSIGN(auto index_commit_msg, RowRangeGlobalIndexWriter::WriteIndex( - table_path, "f0", "bitmap", split, - expected_range, /*options=*/{}, pool_)); + ASSERT_OK_AND_ASSIGN( + auto index_commit_msg, + RowRangeGlobalIndexWriter::WriteIndex( + table_path, "f0", "bitmap", + std::make_shared(split, std::vector({expected_range})), + /*options=*/{}, pool_)); auto index_commit_msg_impl = std::dynamic_pointer_cast(index_commit_msg); ASSERT_TRUE(index_commit_msg_impl); @@ -308,8 +399,8 @@ TEST_P(GlobalIndexTest, TestScanIndex) { GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::nullopt, /*options=*/{}, /*file_system=*/nullptr, pool_)); - ASSERT_OK_AND_ASSIGN(std::set ranges, global_index_scan->GetRowRangeList()); 
- ASSERT_EQ(ranges, std::set({Range(0, 7)})); + ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); + ASSERT_EQ(ranges, std::vector({Range(0, 7)})); ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); // test index reader // test f0 field @@ -458,8 +549,8 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshot) { GlobalIndexScan::Create(table_path, /*snapshot_id=*/2l, /*partitions=*/std::nullopt, /*options=*/{}, /*file_system=*/nullptr, pool_)); - ASSERT_OK_AND_ASSIGN(std::set ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::set({Range(0, 7)})); + ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); + ASSERT_EQ(ranges, std::vector({Range(0, 7)})); ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); // test index reader // test f0 field @@ -509,7 +600,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshotWithNoIndex) { GlobalIndexScan::Create(table_path, /*snapshot_id=*/1l, /*partitions=*/std::nullopt, /*options=*/{}, /*file_system=*/nullptr, pool_)); - ASSERT_OK_AND_ASSIGN(std::set ranges, global_index_scan->GetRowRangeList()); + ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); ASSERT_TRUE(ranges.empty()); ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); @@ -537,8 +628,8 @@ TEST_P(GlobalIndexTest, TestScanIndexWithRange) { GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::nullopt, /*options=*/{}, /*file_system=*/nullptr, pool_)); - ASSERT_OK_AND_ASSIGN(std::set ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::set({Range(0, 7)})); + ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); + ASSERT_EQ(ranges, std::vector({Range(0, 7)})); { ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 3))); // test index reader 
@@ -585,8 +676,8 @@ TEST_P(GlobalIndexTest, TestScanIndexWithPartition) { GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, /*options=*/{}, /*file_system=*/nullptr, pool_)); - ASSERT_OK_AND_ASSIGN(std::set ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::set({Range(0, 4)})); + ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); + ASSERT_EQ(ranges, std::vector({Range(0, 4)})); ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 4))); // test index reader @@ -680,17 +771,19 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndex) { ASSERT_OK(Commit(table_path, commit_msgs)); ASSERT_OK_AND_ASSIGN(auto split, ScanData(table_path, /*partition_filters=*/{})); - ASSERT_OK_AND_ASSIGN(auto index_commit_msg, - RowRangeGlobalIndexWriter::WriteIndex(table_path, "f0", "bitmap", split, - Range(0, 7), /*options=*/{}, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_commit_msg, RowRangeGlobalIndexWriter::WriteIndex( + table_path, "f0", "bitmap", + std::make_shared( + split, std::vector({Range(0, 7)})), + /*options=*/{}, pool_)); ASSERT_OK(Commit(table_path, {index_commit_msg})); ASSERT_OK_AND_ASSIGN(auto global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::nullopt, /*options=*/{}, /*file_system=*/nullptr, pool_)); - ASSERT_OK_AND_ASSIGN(std::set ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::set({Range(0, 7)})); + ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); + ASSERT_EQ(ranges, std::vector({Range(0, 7)})); ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); ASSERT_OK_AND_ASSIGN(auto index_result, @@ -728,14 +821,20 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { ASSERT_OK_AND_ASSIGN(auto split, ScanData(table_path, 
/*partition_filters=*/{partition})); // write bitmap index - ASSERT_OK_AND_ASSIGN(auto bitmap_commit_msg, RowRangeGlobalIndexWriter::WriteIndex( - table_path, "f0", "bitmap", split, - expected_range, /*options=*/{}, pool_)); + ASSERT_OK_AND_ASSIGN( + auto bitmap_commit_msg, + RowRangeGlobalIndexWriter::WriteIndex( + table_path, "f0", "bitmap", + std::make_shared(split, std::vector({expected_range})), + /*options=*/{}, pool_)); ASSERT_OK(Commit(table_path, {bitmap_commit_msg})); // write and commit lumina index - ASSERT_OK_AND_ASSIGN(auto lumina_commit_msg, RowRangeGlobalIndexWriter::WriteIndex( - table_path, "f1", "lumina", split, - expected_range, lumina_options, pool_)); + ASSERT_OK_AND_ASSIGN( + auto lumina_commit_msg, + RowRangeGlobalIndexWriter::WriteIndex( + table_path, "f1", "lumina", + std::make_shared(split, std::vector({expected_range})), + lumina_options, pool_)); ASSERT_OK(Commit(table_path, {lumina_commit_msg})); }; @@ -764,15 +863,17 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { auto scan_and_check_result = [&](const std::map& partition, const Range& expected_range, - GlobalIndexReader::TopKPreFilter filter, int32_t k, const std::string bitmap_result, - const std::string lumina_result) { + GlobalIndexReader::TopKPreFilter filter, int32_t k, const std::string& bitmap_result, + const std::string& lumina_result, const std::vector& read_row_ranges, + const std::shared_ptr& expected_array, + const std::map& id_to_score) { std::vector> partitions = {partition}; ASSERT_OK_AND_ASSIGN(auto global_index_scan, GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, lumina_options, /*file_system=*/nullptr, pool_)); - ASSERT_OK_AND_ASSIGN(std::set ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::set({expected_range})); + ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); + ASSERT_EQ(ranges, std::vector({expected_range})); ASSERT_OK_AND_ASSIGN(auto range_scanner, 
global_index_scan->CreateRangeScan(expected_range)); @@ -799,15 +900,44 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { ASSERT_OK_AND_ASSIGN(auto topk_result, lumina_reader->VisitTopK(k, query, filter, /*predicate*/ nullptr)); ASSERT_EQ(topk_result->ToString(), lumina_result); - }; - auto filter1 = [](int64_t id) -> bool { return id == 0; }; - scan_and_check_result({{"f2", "10"}}, Range(0, 3), filter1, /*k=*/2, "{0}", - "row ids: {0}, scores: {4.21}"); - auto filter2 = [](int64_t id) -> bool { return id == 7 || id == 8; }; - scan_and_check_result({{"f2", "20"}}, Range(4, 8), filter2, /*k=*/1, "{7,8}", - "row ids: {8}, scores: {322.21}"); + // check read array + std::vector read_field_names = schema->field_names(); + read_field_names.push_back("_INDEX_SCORE"); + ASSERT_OK(ScanAndRead(table_path, read_field_names, expected_array, + /*predicate=*/nullptr, read_row_ranges, id_to_score)); + }; + auto result_fields = fields; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + result_fields.push_back(SpecialFields::IndexScore().ArrowField()); + std::map id_to_score = {{0, 4.21f}, {1, 2.01f}, {2, 2.21f}, + {3, 0.01f}, {4, 322.21f}, {5, 360.01f}, + {6, 360.21f}, {7, 398.01}, {8, 322.21f}}; + { + // test scan and read for f2=10 + auto filter = [](int64_t id) -> bool { return id == 0; }; + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21] + ])") + .ValueOrDie(); + scan_and_check_result({{"f2", "10"}}, Range(0, 3), filter, /*k=*/2, "{0}", + "row ids: {0}, scores: {4.21}", {Range(0, 0)}, expected_array, + id_to_score); + } + { + // test scan and read for f2=20 + auto filter = [](int64_t id) -> bool { return id == 7 || id == 8; }; + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1, 322.21] + ])") + .ValueOrDie(); + 
scan_and_check_result({{"f2", "20"}}, Range(4, 8), filter, /*k=*/1, "{7,8}", + "row ids: {8}, scores: {322.21}", {Range(8, 8)}, expected_array, + id_to_score); + } { // test invalid range input ASSERT_OK_AND_ASSIGN(auto global_index_scan, @@ -819,6 +949,117 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { } } +TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithScore) { + arrow::FieldVector fields = { + arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), + arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; + std::map lumina_options = { + {"lumina.dimension", "4"}, + {"lumina.indextype", "bruteforce"}, + {"lumina.distance.metric", "l2"}, + {"lumina.encoding.type", "encoding.rawf32"}, + {"lumina.search.threadcount", "10"}}; + auto schema = arrow::schema(fields); + std::map options = {{Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, GetParam()}, + {Options::FILE_SYSTEM, "local"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}}; + CreateTable(/*partition_keys=*/{}, schema, options); + + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + std::vector write_cols = schema->field_names(); + + auto src_array = std::dynamic_pointer_cast( + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ +["Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], +["Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], +["Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], +["Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1], +["Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], +["Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], +["Tony", [11.0, 10.0, 11.0, 10.0], 20, 17.1], +["Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], +["Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1] + ])") + .ValueOrDie()); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); + ASSERT_OK(Commit(table_path, commit_msgs)); + ASSERT_OK_AND_ASSIGN(auto split, 
ScanData(table_path, /*partition_filters=*/{})); + + // write and commit lumina index + ASSERT_OK_AND_ASSIGN(auto lumina_commit_msg, RowRangeGlobalIndexWriter::WriteIndex( + table_path, "f1", "lumina", + std::make_shared( + split, std::vector({Range(0, 8)})), + lumina_options, pool_)); + ASSERT_OK(Commit(table_path, {lumina_commit_msg})); + + auto scan_and_check_result = [&](const std::vector& read_row_ranges, + const std::shared_ptr& expected_array, + const std::map& id_to_score) { + // check read array + std::vector read_field_names = schema->field_names(); + read_field_names.push_back("_INDEX_SCORE"); + ASSERT_OK(ScanAndRead(table_path, read_field_names, expected_array, + /*predicate=*/nullptr, read_row_ranges, id_to_score)); + }; + + auto result_fields = fields; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + result_fields.push_back(SpecialFields::IndexScore().ArrowField()); + std::map id_to_score = {{0, 4.21f}, {1, 2.01f}, {2, 2.21f}, + {3, 0.01f}, {4, 322.21f}, {5, 360.01f}, + {6, 360.21f}, {7, 398.01}, {8, 322.21f}}; + { + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21], +[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1, 2.01], +[0, "Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1, 2.21], +[0, "Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1, 0.01], +[0, "Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1, 322.21], +[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1, 360.01], +[0, "Tony", [11.0, 10.0, 11.0, 10.0], 20, 17.1, 360.21], +[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1, 398.01], +[0, "Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1, 322.21] + ])") + .ValueOrDie(); + scan_and_check_result({Range(0, 8)}, expected_array, id_to_score); + } + { + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1, 2.21], +[0, "Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1, 0.01], 
+[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1, 398.01], +[0, "Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1, 322.21] + ])") + .ValueOrDie(); + scan_and_check_result({Range(2, 3), Range(7, 8)}, expected_array, id_to_score); + } + { + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1, 360.01] + ])") + .ValueOrDie(); + scan_and_check_result({Range(5, 5)}, expected_array, id_to_score); + } + { + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1, null], +[0, "Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1, null], +[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1, null], +[0, "Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1, null] + ])") + .ValueOrDie(); + scan_and_check_result({Range(2, 3), Range(7, 8)}, expected_array, /*id_to_score=*/{}); + } +} + std::vector GetTestValuesForGlobalIndexTest() { std::vector values = {"parquet"}; #ifdef PAIMON_ENABLE_ORC diff --git a/test/inte/read_inte_test.cpp b/test/inte/read_inte_test.cpp index 74430bed..39d49fac 100644 --- a/test/inte/read_inte_test.cpp +++ b/test/inte/read_inte_test.cpp @@ -90,8 +90,8 @@ class ReadInteTest : public testing::Test, public ::testing::WithParamInterface< std::vector, std::vector, std::vector>>>; - std::vector> CreateDataSplits( - const DataSplitsSimple& input_data_splits, int64_t snapshot_id) const { + std::vector> CreateDataSplits(const DataSplitsSimple& input_data_splits, + int64_t snapshot_id) const { DataSplitsSchemaDv results; results.reserve(input_data_splits.size()); @@ -105,8 +105,8 @@ class ReadInteTest : public testing::Test, public ::testing::WithParamInterface< return CreateDataSplits(results, snapshot_id); } - std::vector> CreateDataSplits(const DataSplitsDv& input_data_splits, - int64_t snapshot_id) const { + std::vector> CreateDataSplits(const DataSplitsDv& input_data_splits, + int64_t snapshot_id) const { 
DataSplitsSchemaDv results; results.reserve(input_data_splits.size()); for (const auto& input_data_split : input_data_splits) { @@ -119,8 +119,8 @@ class ReadInteTest : public testing::Test, public ::testing::WithParamInterface< return CreateDataSplits(results, snapshot_id); } - std::vector> CreateDataSplits( - const DataSplitsSchema& input_data_splits, int64_t snapshot_id) const { + std::vector> CreateDataSplits(const DataSplitsSchema& input_data_splits, + int64_t snapshot_id) const { DataSplitsSchemaDv results; results.reserve(input_data_splits.size()); for (const auto& input_data_split : input_data_splits) { @@ -133,9 +133,9 @@ class ReadInteTest : public testing::Test, public ::testing::WithParamInterface< return CreateDataSplits(results, snapshot_id); } - std::vector> CreateDataSplits( + std::vector> CreateDataSplits( const DataSplitsSchemaDv& input_data_splits, int64_t snapshot_id) const { - std::vector> data_splits; + std::vector> data_splits; for (const auto& input_data_split : input_data_splits) { std::vector> data_file_metas; const auto& bucket_path = std::get<0>(input_data_split); @@ -174,7 +174,7 @@ class ReadInteTest : public testing::Test, public ::testing::WithParamInterface< return data_splits; } - std::shared_ptr GetDataSplitFromFile(const std::string& split_file_name) { + std::shared_ptr GetDataSplitFromFile(const std::string& split_file_name) { auto file_system = std::make_shared(); EXPECT_OK_AND_ASSIGN(auto input_stream, file_system->Open(split_file_name)); std::vector split_bytes(input_stream->Length().value_or(0), 0); @@ -182,9 +182,9 @@ class ReadInteTest : public testing::Test, public ::testing::WithParamInterface< input_stream->Read(split_bytes.data(), split_bytes.size())); EXPECT_OK(input_stream->Close()); - EXPECT_OK_AND_ASSIGN(auto split, DataSplit::Deserialize((char*)split_bytes.data(), - split_bytes.size(), pool_)); - return split; + EXPECT_OK_AND_ASSIGN( + auto split, Split::Deserialize((char*)split_bytes.data(), split_bytes.size(), 
pool_)); + return std::dynamic_pointer_cast(split); } private: @@ -394,7 +394,7 @@ TEST_P(ReadInteTest, TestReadOnlyPartitionField) { ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); - std::vector> data_splits; + std::vector> data_splits; data_splits.reserve(3); for (size_t i = 0; i < 3; ++i) { std::string file_name = path + "/data-splits/data_split-" + std::to_string(i); @@ -2005,7 +2005,7 @@ TEST_P(ReadInteTest, TestReadWithPKFallBackBranch) { auto param = GetParam(); std::string path = paimon::test::GetDataDir() + "/" + param.file_format + "/append_table_with_rt_branch.db/append_table_with_rt_branch"; - std::vector> data_splits; + std::vector> data_splits; data_splits.reserve(3); for (size_t i = 0; i < 3; ++i) { std::string file_name = path + "/data-splits/data_split-" + std::to_string(i); @@ -2069,7 +2069,7 @@ TEST_P(ReadInteTest, TestReadWithAppendFallBackBranch) { auto param = GetParam(); std::string path = paimon::test::GetDataDir() + "/" + param.file_format + "/append_table_with_append_pt_branch.db/append_table_with_append_pt_branch"; - std::vector> data_splits; + std::vector> data_splits; data_splits.reserve(2); for (size_t i = 0; i < 2; ++i) { std::string file_name = path + "/data-splits/data_split-" + std::to_string(i); @@ -2147,7 +2147,7 @@ TEST_P(ReadInteTest, TestReadWithPKRtBranch) { auto param = GetParam(); std::string path = paimon::test::GetDataDir() + "/" + param.file_format + "/append_table_with_rt_branch.db/append_table_with_rt_branch"; - std::vector> data_splits; + std::vector> data_splits; data_splits.reserve(4); for (size_t i = 0; i < 4; ++i) { std::string file_name = path + "/data-splits/data_split-rt-" + std::to_string(i); @@ -2204,7 +2204,7 @@ TEST_P(ReadInteTest, TestReadWithAppendPtBranch) { auto param = GetParam(); std::string path = paimon::test::GetDataDir() + "/" + param.file_format + 
"/append_table_with_append_pt_branch.db/append_table_with_append_pt_branch"; - std::vector> data_splits; + std::vector> data_splits; for (size_t i = 0; i < 1; ++i) { std::string file_name = path + "/data-splits/data_split-pt-" + std::to_string(i); auto split = GetDataSplitFromFile(file_name); diff --git a/test/inte/read_inte_with_index_test.cpp b/test/inte/read_inte_with_index_test.cpp index e0a0ab51..ed921123 100644 --- a/test/inte/read_inte_with_index_test.cpp +++ b/test/inte/read_inte_with_index_test.cpp @@ -74,7 +74,7 @@ class ReadInteWithIndexTest : public testing::Test, void TearDown() override {} void CheckResult(const std::string& table_path, - const std::vector> data_splits, + const std::vector> splits, const std::shared_ptr& predicate, const std::shared_ptr& expected_array) const { auto [file_format, enable_prefetch] = GetParam(); @@ -87,7 +87,7 @@ class ReadInteWithIndexTest : public testing::Test, } ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); - ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_splits)); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(splits)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult(batch_reader.get())); @@ -104,7 +104,7 @@ class ReadInteWithIndexTest : public testing::Test, void CheckResultForBitmap(const std::string& path, const std::shared_ptr& arrow_data_type, - const std::shared_ptr data_split) const { + const std::shared_ptr split) const { { // test with non predicate std::shared_ptr expected_array; @@ -121,7 +121,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, /*predicate=*/nullptr, expected_array); + CheckResult(path, {split}, /*predicate=*/nullptr, expected_array); } { // test equal predicate for f0 @@ -136,7 +136,7 @@ class ReadInteWithIndexTest : 
public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test not equal predicate for f0 @@ -155,7 +155,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test equal predicate for f1 @@ -170,7 +170,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test equal predicate for f2 @@ -186,7 +186,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test is null predicate @@ -199,7 +199,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test is not null predicate @@ -218,7 +218,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test in predicate @@ -237,7 +237,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test not in predicate @@ -254,7 +254,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, 
&expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test and predicate @@ -272,7 +272,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test or predicate @@ -293,19 +293,19 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test predicate push down auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(30)); - CheckResult(path, {data_split}, predicate, /*expected_array=*/nullptr); + CheckResult(path, {split}, predicate, /*expected_array=*/nullptr); } { // test non-result auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(30)); - CheckResult(path, {data_split}, predicate, /*expected_array=*/nullptr); + CheckResult(path, {split}, predicate, /*expected_array=*/nullptr); } { // test early stopping @@ -319,15 +319,15 @@ class ReadInteWithIndexTest : public testing::Test, ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f1_predicate, f2_predicate, f0_predicate})); - CheckResult(path, {data_split}, predicate, /*expected_array=*/nullptr); + CheckResult(path, {split}, predicate, /*expected_array=*/nullptr); } } void CheckResultForBitmapWithSingleRowGroup( const std::string& path, const std::shared_ptr& arrow_data_type, - const std::shared_ptr data_split) const { + const std::shared_ptr split) const { // test bitmap index takes effective - CheckResultForBitmap(path, arrow_data_type, data_split); + CheckResultForBitmap(path, arrow_data_type, split); // test no index 
take effective { @@ -348,7 +348,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test predicate on f3 (do not have index) @@ -368,13 +368,13 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } } void CheckResultForBsi(const std::string& path, const std::shared_ptr& arrow_data_type, - const std::shared_ptr data_split) const { + const std::shared_ptr split) const { { // test with non predicate std::shared_ptr expected_array; @@ -391,7 +391,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, /*predicate=*/nullptr, expected_array); + CheckResult(path, {split}, /*predicate=*/nullptr, expected_array); } { // test is null predicate for f4 @@ -404,7 +404,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test equal predicate for f1 @@ -418,7 +418,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test not equal predicate for f2 @@ -435,7 +435,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test greater than predicate for f1 @@ -450,7 +450,7 @@ class 
ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test greater or equal predicate for f2 @@ -466,7 +466,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test less than predicate for f4 @@ -483,7 +483,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test less or equal predicate for f4, as timestamp is normalized to long (micros), @@ -503,7 +503,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test in for f2 @@ -520,7 +520,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test not in for f1 @@ -537,7 +537,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test and predicate @@ -555,7 +555,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test or predicate @@ -579,7 +579,7 @@ 
class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test and predicate for is not null @@ -602,7 +602,7 @@ class ReadInteWithIndexTest : public testing::Test, ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } } @@ -655,7 +655,7 @@ TEST_P(ReadInteWithIndexTest, TestSimple) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); auto predicate = @@ -679,7 +679,7 @@ TEST_P(ReadInteWithIndexTest, TestSimple) { } ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); - ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_split)); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(split)); ASSERT_OK_AND_ASSIGN(auto result_array, ReadResultCollector::CollectResult(batch_reader.get())); ASSERT_TRUE(result_array); ASSERT_TRUE(result_array->Equals(*expected_array)); @@ -729,7 +729,7 @@ TEST_P(ReadInteWithIndexTest, TestReadWithLimits) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); auto predicate = @@ -746,7 +746,7 @@ 
TEST_P(ReadInteWithIndexTest, TestReadWithLimits) { } ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); ASSERT_OK_AND_ASSIGN(auto table_read, TableRead::Create(std::move(read_context))); - ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(data_split)); + ASSERT_OK_AND_ASSIGN(auto batch_reader, table_read->CreateReader(split)); // simulate read limits, only read 3 batches for (int32_t i = 0; i < 3; i++) { ASSERT_OK_AND_ASSIGN(BatchReader::ReadBatch batch, batch_reader->NextBatch()); @@ -833,9 +833,9 @@ TEST_P(ReadInteWithIndexTest, TestEmbeddingBitmapIndex) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); - CheckResultForBitmapWithSingleRowGroup(path, arrow_data_type, data_split); + CheckResultForBitmapWithSingleRowGroup(path, arrow_data_type, split); } TEST_P(ReadInteWithIndexTest, TestBitmapWithV1) { @@ -896,9 +896,9 @@ TEST_P(ReadInteWithIndexTest, TestBitmapWithV1) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); - CheckResultForBitmapWithSingleRowGroup(path, arrow_data_type, data_split); + CheckResultForBitmapWithSingleRowGroup(path, arrow_data_type, split); } TEST_P(ReadInteWithIndexTest, TestNoEmbeddingBitmapIndex) { @@ -935,9 +935,9 @@ TEST_P(ReadInteWithIndexTest, TestNoEmbeddingBitmapIndex) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); 
DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); - CheckResultForBitmapWithSingleRowGroup(path, arrow_data_type, data_split); + CheckResultForBitmapWithSingleRowGroup(path, arrow_data_type, split); } TEST_P(ReadInteWithIndexTest, TestNoEmbeddingBitmapIndexWithExternalPath) { @@ -981,9 +981,9 @@ TEST_P(ReadInteWithIndexTest, TestNoEmbeddingBitmapIndexWithExternalPath) { /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); - CheckResultForBitmapWithSingleRowGroup(path, arrow_data_type, data_split); + CheckResultForBitmapWithSingleRowGroup(path, arrow_data_type, split); } TEST_P(ReadInteWithIndexTest, TestBitmapIndexWithDv) { @@ -1025,11 +1025,11 @@ TEST_P(ReadInteWithIndexTest, TestBitmapIndexWithDv) { /*offset=*/1, /*length=*/24, /*cardinality=*/2); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, builder.WithSnapshot(4) - .WithDataDeletionFiles({deletion_file}) - .IsStreaming(false) - .RawConvertible(true) - .Build()); + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(4) + .WithDataDeletionFiles({deletion_file}) + .IsStreaming(false) + .RawConvertible(true) + .Build()); { // test with non predicate std::shared_ptr expected_array; @@ -1043,7 +1043,7 @@ TEST_P(ReadInteWithIndexTest, TestBitmapIndexWithDv) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, /*predicate=*/nullptr, expected_array); + CheckResult(path, {split}, /*predicate=*/nullptr, 
expected_array); } { // test equal, Alice with key 0 is removed by dv @@ -1056,14 +1056,14 @@ TEST_P(ReadInteWithIndexTest, TestBitmapIndexWithDv) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test equal, Lucy is removed by dv auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Lucy", 4)); - CheckResult(path, {data_split}, predicate, /*expected_array=*/nullptr); + CheckResult(path, {split}, predicate, /*expected_array=*/nullptr); } { // test or predicate @@ -1083,7 +1083,7 @@ TEST_P(ReadInteWithIndexTest, TestBitmapIndexWithDv) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } } @@ -1185,7 +1185,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "/bucket-0/", {data_file_meta1, data_file_meta2}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(2).IsStreaming(false).RawConvertible(true).Build()); { // test with non predicate @@ -1206,7 +1206,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, /*predicate=*/nullptr, expected_array); + CheckResult(path, {split}, /*predicate=*/nullptr, expected_array); } { // test equal predicate for f1 @@ -1223,7 +1223,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test not equal predicate for f1 @@ -1240,7 +1240,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, 
&expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test equal predicate for f4 @@ -1255,7 +1255,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test not equal predicate for f4 @@ -1276,7 +1276,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test equal predicate for f3, only do predicate push down @@ -1295,7 +1295,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test is null predicate for f5 @@ -1315,7 +1315,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test is_not_null predicate for f5 @@ -1339,7 +1339,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test greater than predicate for f1, do not take effective in bitmap index @@ -1362,7 +1362,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test in predicate @@ -1382,7 
+1382,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test not in predicate @@ -1398,7 +1398,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test and predicate @@ -1414,7 +1414,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test or predicate @@ -1433,7 +1433,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test early stop @@ -1446,13 +1446,13 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { PredicateBuilder::IsNotNull(/*field_index=*/3, /*field_name=*/"f5", FieldType::INT); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f4_predicate, f1_predicate, f5_predicate})); - CheckResult(path, {data_split}, predicate, /*expected_array=*/nullptr); + CheckResult(path, {split}, predicate, /*expected_array=*/nullptr); } { // test non result auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f1", FieldType::BIGINT, Literal(40l)); - CheckResult(path, {data_split}, predicate, /*expected_array=*/nullptr); + CheckResult(path, {split}, predicate, /*expected_array=*/nullptr); } { auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f1", @@ -1474,7 +1474,7 @@ TEST_P(ReadInteWithIndexTest, TestWithAlterTable) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - 
CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } } @@ -1512,7 +1512,7 @@ TEST_P(ReadInteWithIndexTest, TestWithBsiIndex) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); { // test equal predicate for f0, take no effective as bsi does not support string @@ -1532,9 +1532,9 @@ TEST_P(ReadInteWithIndexTest, TestWithBsiIndex) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } - CheckResultForBsi(path, arrow_data_type, {data_split}); + CheckResultForBsi(path, arrow_data_type, {split}); } TEST_P(ReadInteWithIndexTest, TestWithBloomFilterIndex) { @@ -1572,7 +1572,7 @@ TEST_P(ReadInteWithIndexTest, TestWithBloomFilterIndex) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); std::shared_ptr all_array; @@ -1590,123 +1590,123 @@ TEST_P(ReadInteWithIndexTest, TestWithBloomFilterIndex) { ASSERT_TRUE(array_status.ok()); { // test with non predicate - CheckResult(path, {data_split}, /*predicate=*/nullptr, all_array); + CheckResult(path, {split}, /*predicate=*/nullptr, all_array); } { // test equal predicate for f0 auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - 
CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test equal predicate for f0, where literal does not exist auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice2", 6)); - CheckResult(path, {data_split}, predicate, nullptr); + CheckResult(path, {split}, predicate, nullptr); } { // test equal predicate for f1 auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(200)); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test equal predicate for f1, where literal does not exist auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(201)); - CheckResult(path, {data_split}, predicate, nullptr); + CheckResult(path, {split}, predicate, nullptr); } { // test equal predicate for f2 auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(-1)); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test equal predicate for f2, where literal does not exist auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(0)); - CheckResult(path, {data_split}, predicate, nullptr); + CheckResult(path, {split}, predicate, nullptr); } { // test equal predicate for f3 auto predicate = PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, Literal(13.1)); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test equal predicate for f3, where literal does not exist auto predicate = PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, Literal(13.2)); - CheckResult(path, {data_split}, predicate, nullptr); + 
CheckResult(path, {split}, predicate, nullptr); } { // test equal predicate for f4 auto predicate = PredicateBuilder::Equal(/*field_index=*/4, /*field_name=*/"f4", FieldType::TIMESTAMP, Literal(Timestamp(1745542902000l, 123000))); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test equal predicate for f4, where literal does not exist auto predicate = PredicateBuilder::Equal(/*field_index=*/4, /*field_name=*/"f4", FieldType::TIMESTAMP, Literal(Timestamp(1745542502000l, 123000))); - CheckResult(path, {data_split}, predicate, nullptr); + CheckResult(path, {split}, predicate, nullptr); } { // test not equal predicate for f1 auto predicate = PredicateBuilder::NotEqual(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(200)); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test is null predicate for f2 auto predicate = PredicateBuilder::IsNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test is not null predicate for f2 auto predicate = PredicateBuilder::IsNotNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test greater than predicate for f1 auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(200)); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test in for f2 auto predicate = PredicateBuilder::In(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, {Literal(-1), Literal(2), Literal(100)}); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test in for f2, where literals do not 
exist auto predicate = PredicateBuilder::In(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, {Literal(-1000), Literal(0), Literal(1000)}); - CheckResult(path, {data_split}, predicate, nullptr); + CheckResult(path, {split}, predicate, nullptr); } { // test not in for f3 auto predicate = PredicateBuilder::NotIn(/*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, {Literal(11.1), Literal(12.1), Literal(13.1)}); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test not in for f3, where literals do not exist auto predicate = PredicateBuilder::NotIn(/*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, {Literal(11.12), Literal(12.12), Literal(13.12)}); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test and predicate @@ -1716,7 +1716,7 @@ TEST_P(ReadInteWithIndexTest, TestWithBloomFilterIndex) { PredicateBuilder::Equal(/*field_index=*/4, /*field_name=*/"f4", FieldType::TIMESTAMP, Literal(Timestamp(1745542902000l, 123000))); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f1_predicate, f4_predicate})); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } { // test and predicate @@ -1726,7 +1726,7 @@ TEST_P(ReadInteWithIndexTest, TestWithBloomFilterIndex) { PredicateBuilder::Equal(/*field_index=*/4, /*field_name=*/"f4", FieldType::TIMESTAMP, Literal(Timestamp(-1728l, 123000))); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f1_predicate, f4_predicate})); - CheckResult(path, {data_split}, predicate, nullptr); + CheckResult(path, {split}, predicate, nullptr); } { // test or predicate @@ -1736,7 +1736,7 @@ TEST_P(ReadInteWithIndexTest, TestWithBloomFilterIndex) { PredicateBuilder::Equal(/*field_index=*/4, /*field_name=*/"f4", FieldType::TIMESTAMP, Literal(Timestamp(-1728l, 123000))); ASSERT_OK_AND_ASSIGN(auto predicate, 
PredicateBuilder::Or({f1_predicate, f4_predicate})); - CheckResult(path, {data_split}, predicate, all_array); + CheckResult(path, {split}, predicate, all_array); } } @@ -1778,11 +1778,11 @@ TEST_P(ReadInteWithIndexTest, TestBitmapPushDownWithMultiStripes) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); // test bitmap index takes effective - CheckResultForBitmap(path, arrow_data_type, data_split); + CheckResultForBitmap(path, arrow_data_type, split); // test predicate push down takes effective { @@ -1798,7 +1798,7 @@ TEST_P(ReadInteWithIndexTest, TestBitmapPushDownWithMultiStripes) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test predicate on f3 (do not have index), but predicates can be pushdown @@ -1810,7 +1810,7 @@ TEST_P(ReadInteWithIndexTest, TestBitmapPushDownWithMultiStripes) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test and predicate, although the bitmap index cannot handle the LessThan @@ -1827,7 +1827,7 @@ TEST_P(ReadInteWithIndexTest, TestBitmapPushDownWithMultiStripes) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test or predicate, although the bitmap index cannot handle the LessOrEqual @@ -1849,7 +1849,7 @@ TEST_P(ReadInteWithIndexTest, TestBitmapPushDownWithMultiStripes) { ])"}, &expected_array); 
ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } } @@ -1888,10 +1888,10 @@ TEST_P(ReadInteWithIndexTest, TestWithBitmapAndBsiAndBloomFilterIndex) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); - CheckResultForBsi(path, arrow_data_type, data_split); + CheckResultForBsi(path, arrow_data_type, split); { // test equal predicate for f3, only bloom filter take effective auto predicate = PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3", @@ -1909,13 +1909,13 @@ TEST_P(ReadInteWithIndexTest, TestWithBitmapAndBsiAndBloomFilterIndex) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test equal predicate for f3, only bloom filter take effective auto predicate = PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, Literal(14.13)); - CheckResult(path, {data_split}, predicate, /*expected_array=*/nullptr); + CheckResult(path, {split}, predicate, /*expected_array=*/nullptr); } { // test equal predicate for f0, bitmap index takes effective @@ -1929,7 +1929,7 @@ TEST_P(ReadInteWithIndexTest, TestWithBitmapAndBsiAndBloomFilterIndex) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } } @@ -1970,7 +1970,7 @@ TEST_P(ReadInteWithIndexTest, TestWithIndexWithoutRegistered) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); 
DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); { // only bitmap is registered @@ -1999,7 +1999,7 @@ TEST_P(ReadInteWithIndexTest, TestWithIndexWithoutRegistered) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test and predicate, only f1_equals takes effective @@ -2017,7 +2017,7 @@ TEST_P(ReadInteWithIndexTest, TestWithIndexWithoutRegistered) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } } { @@ -2048,7 +2048,7 @@ TEST_P(ReadInteWithIndexTest, TestWithIndexWithoutRegistered) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } { // test and predicate, as bsi is registered f1_equals and f2_greater_than all take @@ -2066,7 +2066,7 @@ TEST_P(ReadInteWithIndexTest, TestWithIndexWithoutRegistered) { ])"}, &expected_array); ASSERT_TRUE(array_status.ok()); - CheckResult(path, {data_split}, predicate, expected_array); + CheckResult(path, {split}, predicate, expected_array); } } } @@ -2105,7 +2105,7 @@ TEST_P(ReadInteWithIndexTest, TestWithIOException) { /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, /*bucket_path=*/path + "bucket-0/", {data_file_meta}); - ASSERT_OK_AND_ASSIGN(auto data_split, + ASSERT_OK_AND_ASSIGN(auto split, builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); auto predicate = @@ -2134,8 +2134,7 @@ 
TEST_P(ReadInteWithIndexTest, TestWithIOException) { ASSERT_OK_AND_ASSIGN(auto read_context, context_builder.Finish()); Result> table_read = TableRead::Create(std::move(read_context)); CHECK_HOOK_STATUS(table_read.status(), i); - Result> batch_reader = - table_read.value()->CreateReader(data_split); + Result> batch_reader = table_read.value()->CreateReader(split); CHECK_HOOK_STATUS(batch_reader.status(), i); auto result = ReadResultCollector::CollectResult(batch_reader.value().get()); CHECK_HOOK_STATUS(result.status(), i); diff --git a/test/inte/scan_and_read_inte_test.cpp b/test/inte/scan_and_read_inte_test.cpp index 708f2119..3ceea877 100644 --- a/test/inte/scan_and_read_inte_test.cpp +++ b/test/inte/scan_and_read_inte_test.cpp @@ -136,7 +136,7 @@ class ScanAndReadInteTest : public testing::Test, void AdjustSplitWithExternalPath(const std::string& src_path, const std::string& target_path, bool adjust_index, - std::vector>* splits_ptr) { + std::vector>* splits_ptr) { // adjust external path from src_path to target_path auto& splits = *splits_ptr; for (auto& split : splits) { diff --git a/test/inte/write_and_read_inte_test.cpp b/test/inte/write_and_read_inte_test.cpp index 6ab26602..ed95081c 100644 --- a/test/inte/write_and_read_inte_test.cpp +++ b/test/inte/write_and_read_inte_test.cpp @@ -104,7 +104,7 @@ TEST_P(WriteAndReadInteTest, TestAppendSimple) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ [0, "banana", 2], @@ -165,7 +165,7 @@ TEST_P(WriteAndReadInteTest, TestPKSimple) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - 
ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string data = R"([ [0, "apple", 20, 23.0], @@ -222,7 +222,7 @@ TEST_P(WriteAndReadInteTest, TestNestedType) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ @@ -322,7 +322,7 @@ TEST_P(WriteAndReadInteTest, TestAppendExternalPath) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ [0, "Alice", 10, 0, 11.1], @@ -408,7 +408,7 @@ TEST_P(WriteAndReadInteTest, TestAppendExternalPathAndNoneExternalPathStrategy) fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ [0, "Alice", 10, 0, 11.1], @@ -465,7 +465,7 @@ TEST_P(WriteAndReadInteTest, TestAppendTimestampType) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), 
/*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ [0, "1970-01-01 00:00:01", "1970-01-01 00:00:00.001", "1970-01-01 00:00:00.000001", "1970-01-01 00:00:00.000000001", "1970-01-01 00:00:02", "1970-01-01 00:00:00.002", "1970-01-01 00:00:00.000002", "1970-01-01 00:00:00.000000002"], @@ -521,7 +521,7 @@ TEST_P(WriteAndReadInteTest, TestPkTimestampType) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); std::string expected_data = R"([ [0, "1970-01-01 00:00:01", "1969-01-01 00:00:00.003", "1970-01-01 00:00:00.000001", "1970-01-01 00:00:00.000000003", "1970-01-01 00:00:04", "1970-01-01 00:00:00.004", "1970-01-01 00:00:00.000004", "1970-01-01 00:00:00.000000004", 0], diff --git a/test/inte/write_inte_test.cpp b/test/inte/write_inte_test.cpp index 6fa72b27..e526d359 100644 --- a/test/inte/write_inte_test.cpp +++ b/test/inte/write_inte_test.cpp @@ -340,7 +340,7 @@ TEST_P(WriteInteTest, TestAppendTableBatchWrite) { arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits_1.size(), 1); std::string expected_data_1 = @@ -448,7 +448,7 @@ TEST_P(WriteInteTest, TestAppendTableStreamWriteWithOneBucket) { [0, null, 1, 32767, 2147483647, null, null, 2.0, 3.141592657, null, "lucy"], [0, true, -2, -32768, -2147483648, null, -4294967298, 2.0, 3.141592657, "20250326", "mouse"]])"; - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); 
ASSERT_EQ(data_splits_1.size(), 1); ASSERT_OK_AND_ASSIGN(bool success, @@ -507,7 +507,7 @@ TEST_P(WriteInteTest, TestAppendTableStreamWriteWithOneBucket) { ASSERT_EQ(2, snapshot2.value().Id()); ASSERT_EQ(7, snapshot2.value().TotalRecordCount().value()); ASSERT_EQ(3, snapshot2.value().DeltaRecordCount().value()); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); ASSERT_EQ(data_splits_2.size(), 1); std::string expected_data_2 = @@ -574,7 +574,7 @@ TEST_P(WriteInteTest, TestAppendTableStreamWriteWithPartitionAndMultiBuckets) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits_1.size(), 3); @@ -627,7 +627,7 @@ TEST_P(WriteInteTest, TestAppendTableStreamWriteWithPartitionAndMultiBuckets) { ASSERT_EQ(16, snapshot2.value().TotalRecordCount().value()); ASSERT_EQ(8, snapshot2.value().DeltaRecordCount().value()); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); ASSERT_EQ(data_splits_2.size(), 3); std::map, std::string> expected_datas_2; @@ -774,7 +774,7 @@ TEST_P(WriteInteTest, TestAppendTableWriteWithComplexType) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits_1.size(), 1); std::string expected_data_1 = R"([ @@ -832,7 +832,7 @@ TEST_P(WriteInteTest, TestAppendTableWriteWithComplexType) { 
ASSERT_EQ(10, snapshot2.value().TotalRecordCount().value()); ASSERT_EQ(4, snapshot2.value().DeltaRecordCount().value()); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); ASSERT_EQ(data_splits_2.size(), 1); std::string expected_data_2 = R"([ [0, [[10, 11]], [1.1, 1.3], [true, 1], "1970-01-01 00:02:03.999999", 24, "0.28"], @@ -970,7 +970,7 @@ TEST_P(WriteInteTest, TestPkTableStreamWrite) { arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits_1.size(), 3); @@ -1087,7 +1087,7 @@ TEST_P(WriteInteTest, TestPkTableStreamWrite) { ASSERT_EQ(4, snapshot2.value().DeltaRecordCount().value()); // round 2 read - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); ASSERT_EQ(data_splits_2.size(), 3); std::map, std::string> expected_datas_2; expected_datas_2[std::make_pair("f1=20250326/", 0)] = R"([[0, "Farm", "20250326", 15, 22.1]])"; @@ -1225,7 +1225,7 @@ TEST_P(WriteInteTest, TestPkTableBatchWrite) { ASSERT_EQ(5, snapshot1.value().TotalRecordCount().value()); ASSERT_EQ(5, snapshot1.value().DeltaRecordCount().value()); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits_1.size(), 3); @@ -1365,7 +1365,7 @@ TEST_P(WriteInteTest, TestPkTableWriteWithNoPartitionKey) { arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), 
/*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits_1.size(), 2); @@ -1455,7 +1455,7 @@ TEST_P(WriteInteTest, TestPkTableWriteWithNoPartitionKey) { ASSERT_EQ(4, snapshot2.value().DeltaRecordCount().value()); // round2 read - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); ASSERT_EQ(data_splits_2.size(), 2); std::map expected_datas_2; expected_datas_2[0] = @@ -1626,7 +1626,7 @@ TEST_P(WriteInteTest, TestPkTableWriteWithComplexType) { arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits_1.size(), 1); std::string expected_data_1 = R"([ @@ -1695,7 +1695,7 @@ TEST_P(WriteInteTest, TestPkTableWriteWithComplexType) { ASSERT_EQ(9, snapshot2.value().TotalRecordCount().value()); ASSERT_EQ(4, snapshot2.value().DeltaRecordCount().value()); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); ASSERT_EQ(data_splits_2.size(), 1); std::string expected_data_2 = R"([ [3, [[127, 32767], [-128, -32768]], [1.1, 1.2], [false, 2222], "1970-01-01 00:02:03.123123", 245, "0.12"], @@ -1760,7 +1760,7 @@ TEST_P(WriteInteTest, TestPkTableForceLookup) { arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt, /*is_streaming=*/false)); ASSERT_EQ(data_splits.size(), 1); @@ -1826,7 +1826,7 @@ TEST_P(WriteInteTest, TestPkTableEnableDeletionVector) { arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - 
ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt, /*is_streaming=*/false)); ASSERT_TRUE(data_splits.empty()); @@ -2119,7 +2119,7 @@ TEST_F(WriteInteTest, TestAppendTableWriteWithAlterTable) { ASSERT_OK_AND_ASSIGN(auto helper, TestHelper::Create(table_path, options, /*is_streaming_mode=*/true)); // scan with empty split - ASSERT_OK_AND_ASSIGN(std::vector> empty_splits, + ASSERT_OK_AND_ASSIGN(std::vector> empty_splits, helper->NewScan(StartupMode::Latest(), /*snapshot_id=*/std::nullopt)); ASSERT_TRUE(empty_splits.empty()); @@ -2161,7 +2161,7 @@ TEST_F(WriteInteTest, TestAppendTableWriteWithAlterTable) { ASSERT_EQ(3, snapshot.value().Id()); // read - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->Scan()); + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->Scan()); ASSERT_EQ(data_splits.size(), 1); arrow::FieldVector fields_with_row_kind = fields; @@ -2697,7 +2697,7 @@ TEST_P(WriteInteTest, TestAppendTableStreamWriteWithExternalPath) { arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits_1.size(), 1); std::string expected_data_1 = @@ -2760,7 +2760,7 @@ TEST_P(WriteInteTest, TestAppendTableStreamWriteWithExternalPath) { ASSERT_EQ(2, snapshot2.value().Id()); ASSERT_EQ(7, snapshot2.value().TotalRecordCount().value()); ASSERT_EQ(3, snapshot2.value().DeltaRecordCount().value()); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_2, helper->Scan()); ASSERT_EQ(data_splits_2.size(), 1); std::string expected_data_2 = fmt::format(R"([ @@ -3372,7 +3372,7 @@ TEST_P(WriteInteTest, TestPkTablePostponeBucket) { ASSERT_EQ(5, 
snapshot1.value().TotalRecordCount().value()); ASSERT_EQ(5, snapshot1.value().DeltaRecordCount().value()); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits.size(), 1); @@ -3805,7 +3805,7 @@ TEST_P(WriteInteTest, TestAppendTableWriteWithBlobType) { ASSERT_EQ(4, snapshot.value().NextRowId().value()); // check data file meta after commit - ASSERT_OK_AND_ASSIGN(std::vector> data_splits, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits.size(), 1); auto data_split = std::dynamic_pointer_cast(data_splits[0]); @@ -3878,7 +3878,7 @@ TEST_P(WriteInteTest, TestAppendTableWithDateFieldAsPartitionField) { fields_with_row_kind.insert(fields_with_row_kind.begin(), arrow::field("_VALUE_KIND", arrow::int8())); auto data_type = arrow::struct_(fields_with_row_kind); - ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, + ASSERT_OK_AND_ASSIGN(std::vector> data_splits_1, helper->NewScan(StartupMode::LatestFull(), /*snapshot_id=*/std::nullopt)); ASSERT_EQ(data_splits_1.size(), 1); std::string expected_data_1 = diff --git a/test/test_data/fileindex/bitmap-index-v1 b/test/test_data/file_index/bitmap-index-v1 similarity index 100% rename from test/test_data/fileindex/bitmap-index-v1 rename to test/test_data/file_index/bitmap-index-v1 diff --git a/test/test_data/fileindex/bitmap-index-v2 b/test/test_data/file_index/bitmap-index-v2 similarity index 100% rename from test/test_data/fileindex/bitmap-index-v2 rename to test/test_data/file_index/bitmap-index-v2 diff --git a/test/test_data/global_index/indexed_split-01 b/test/test_data/global_index/indexed_split-01 new file mode 100644 index 00000000..8aec4645 Binary files /dev/null and b/test/test_data/global_index/indexed_split-01 differ diff --git a/test/test_data/global_index/indexed_split-02 
b/test/test_data/global_index/indexed_split-02 new file mode 100644 index 00000000..ab40bc56 Binary files /dev/null and b/test/test_data/global_index/indexed_split-02 differ