Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions include/paimon/global_index/global_index_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,15 @@ class PAIMON_EXPORT GlobalIndexScan {
virtual Result<std::shared_ptr<RowRangeGlobalIndexScanner>> CreateRangeScan(
const Range& range) = 0;

/// Returns the set of row ID ranges covered by this global index.
/// Returns row ID ranges covered by this global index (sorted and non-overlapping
/// ranges).
///
/// Each `Range` represents a contiguous segment of row IDs for which global index
/// data exists. This allows the query engine to parallelize scanning and be aware
/// of ranges that are not covered by any global index.
///
/// @return A `Result` containing a set of non-overlapping `Range` objects.
virtual Result<std::set<Range>> GetRowRangeList() = 0;
/// @return A `Result` containing sorted and non-overlapping `Range` objects.
virtual Result<std::vector<Range>> GetRowRangeList() = 0;
};

} // namespace paimon
45 changes: 45 additions & 0 deletions include/paimon/global_index/indexed_split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/*
* Copyright 2024-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "paimon/table/source/data_split.h"
#include "paimon/utils/range.h"
#include "paimon/visibility.h"

namespace paimon {
/// Indexed split for global index reading operation.
class PAIMON_EXPORT IndexedSplit : public Split {
public:
/// @returns The underlying physical data split containing actual data file details.
virtual std::shared_ptr<DataSplit> GetDataSplit() const = 0;

/// @returns A list of row intervals [start, end] indicating which rows
/// are relevant (e.g., passed predicate pushdown).
virtual const std::vector<Range>& RowRanges() const = 0;

/// @returns A score for **each individual row** included in `RowRanges()`,
/// in the order they appear when traversing the ranges.
virtual const std::vector<float>& Scores() const = 0;
};
} // namespace paimon
8 changes: 4 additions & 4 deletions include/paimon/global_index/row_range_global_index_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
#include <map>
#include <string>

#include "paimon/global_index/indexed_split.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/table/source/data_split.h"
#include "paimon/utils/range.h"
#include "paimon/visibility.h"

Expand All @@ -35,8 +35,8 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter {
/// @param table_path Path to the table root directory where index files are stored.
/// @param field_name Name of the indexed column (must be present in the table schema).
/// @param index_type Type of global index to build (e.g., "bitmap", "lumina").
/// @param split The data split (e.g., Parquet file) containing the actual data.
/// @param range Row ID range [from, to] for data to build index.
/// @param index_split The indexed split containing the actual data (e.g., Parquet file) and
// row id range [from, to] for data to build index.
/// The range must be fully contained within the data covered
/// by the given `split`.
/// @param options Index-specific configuration (e.g., false positive rate for bloom
Expand All @@ -47,7 +47,7 @@ class PAIMON_EXPORT RowRangeGlobalIndexWriter {
/// or an error if indexing fails (e.g., unsupported type, I/O error).
static Result<std::shared_ptr<CommitMessage>> WriteIndex(
const std::string& table_path, const std::string& field_name, const std::string& index_type,
const std::shared_ptr<DataSplit>& split, const Range& range,
const std::shared_ptr<IndexedSplit>& indexed_split,
const std::map<std::string, std::string>& options, const std::shared_ptr<MemoryPool>& pool);
};

Expand Down
22 changes: 2 additions & 20 deletions include/paimon/read_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#include "paimon/predicate/predicate.h"
#include "paimon/result.h"
#include "paimon/type_fwd.h"
#include "paimon/utils/range.h"
#include "paimon/visibility.h"

namespace paimon {
Expand All @@ -43,8 +42,8 @@ class PAIMON_EXPORT ReadContext {
public:
ReadContext(const std::string& path, const std::string& branch,
const std::vector<std::string>& read_schema,
const std::shared_ptr<Predicate>& predicate, const std::vector<Range>& row_ranges,
bool enable_predicate_filter, bool enable_prefetch, uint32_t prefetch_batch_count,
const std::shared_ptr<Predicate>& predicate, bool enable_predicate_filter,
bool enable_prefetch, uint32_t prefetch_batch_count,
uint32_t prefetch_max_parallel_num, bool enable_multi_thread_row_to_batch,
uint32_t row_to_batch_thread_number, const std::optional<std::string>& table_schema,
const std::shared_ptr<MemoryPool>& memory_pool,
Expand Down Expand Up @@ -77,10 +76,6 @@ class PAIMON_EXPORT ReadContext {
return predicate_;
}

const std::vector<Range>& GetRowRanges() const {
return row_ranges_;
}

bool EnablePredicateFilter() const {
return enable_predicate_filter_;
}
Expand Down Expand Up @@ -114,7 +109,6 @@ class PAIMON_EXPORT ReadContext {
std::string branch_;
std::vector<std::string> read_schema_;
std::shared_ptr<Predicate> predicate_;
std::vector<Range> row_ranges_;
bool enable_predicate_filter_;
bool enable_prefetch_;
uint32_t prefetch_batch_count_;
Expand Down Expand Up @@ -273,18 +267,6 @@ class PAIMON_EXPORT ReadContextBuilder {
ReadContextBuilder& WithFileSystemSchemeToIdentifierMap(
const std::map<std::string, std::string>& fs_scheme_to_identifier_map);

/// Set specific row ranges to read for targeted data access.
///
/// This is primarily used in data evolution scenarios where only specific rows
/// need to be read. File ranges that do not intersect with the specified row ranges
/// will be filtered out, improving performance by avoiding unnecessary I/O.
///
/// @param row_ranges Vector of specific row ranges to read.
/// @return Reference to this builder for method chaining.
/// @note If not set, all rows in the selected files will be returned.
/// @note This is commonly used in data evolution mode for selective reading.
ReadContextBuilder& SetRowRanges(const std::vector<Range>& row_ranges);

/// Build and return a `ReadContext` instance with input validation.
/// @return Result containing the constructed `ReadContext` or an error status.
Result<std::unique_ptr<ReadContext>> Finish();
Expand Down
34 changes: 4 additions & 30 deletions include/paimon/table/source/data_split.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,42 +26,15 @@
#include "paimon/data/timestamp.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/table/source/split.h"
#include "paimon/visibility.h"

namespace paimon {
class MemoryPool;

/// Input splits for read operation. Needed by most batch computation engines. Support Serialize and
/// Deserialize, compatible with java version.
class PAIMON_EXPORT DataSplit {
/// Input data split for reading operation. Needed by most batch computation engines.
class PAIMON_EXPORT DataSplit : public Split {
public:
virtual ~DataSplit() = default;

/// Deserialize a `DataSplit` from a binary buffer.
///
/// Creates a `DataSplit` instance from its serialized binary representation.
/// This is typically used in distributed computing scenarios where splits
/// are transmitted between different nodes or processes.
///
/// @param buffer Const pointer to the binary data containing the serialized `DataSplit`.
/// @param length Size of the buffer in bytes.
/// @param pool Memory pool for allocating objects during deserialization.
/// @return Result containing the deserialized `DataSplit` or an error status.
static Result<std::shared_ptr<DataSplit>> Deserialize(const char* buffer, size_t length,
const std::shared_ptr<MemoryPool>& pool);

/// Serialize a `DataSplit` to a binary string.
///
/// Converts a `DataSplit` instance to its binary representation for storage
/// or transmission. The serialized data can later be deserialized using
/// the Deserialize method.
///
/// @param data_split The `DataSplit` instance to serialize.
/// @param pool Memory pool for allocating temporary objects during serialization.
/// @return Result containing the serialized binary data as a string or an error status.
static Result<std::string> Serialize(const std::shared_ptr<DataSplit>& data_split,
const std::shared_ptr<MemoryPool>& pool);

/// Metadata structure for simple data files.
///
/// Contains essential information about a data file including its location,
Expand Down Expand Up @@ -97,6 +70,7 @@ class PAIMON_EXPORT DataSplit {
std::optional<int64_t> delete_row_count;

bool operator==(const SimpleDataFileMeta& other) const;

std::string ToString() const;
};

Expand Down
4 changes: 2 additions & 2 deletions include/paimon/table/source/plan.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@
#include <optional>
#include <vector>

#include "paimon/table/source/data_split.h"
#include "paimon/table/source/split.h"

namespace paimon {
/// %Result plan of this `TableScan`.
class PAIMON_EXPORT Plan {
public:
virtual ~Plan() = default;
/// %Result splits.
virtual const std::vector<std::shared_ptr<DataSplit>>& Splits() const = 0;
virtual const std::vector<std::shared_ptr<Split>>& Splits() const = 0;
/// Snapshot id of this plan, return `std::nullopt` if the table is empty.
virtual std::optional<int64_t> SnapshotId() const = 0;
};
Expand Down
66 changes: 66 additions & 0 deletions include/paimon/table/source/split.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright 2024-present Alibaba Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include "paimon/memory/memory_pool.h"
#include "paimon/result.h"
#include "paimon/visibility.h"

namespace paimon {
class MemoryPool;

/// An input split for reading operation. Needed by most batch computation engines. Support
/// Serialize and Deserialize, compatible with java version.
/// This split can be either a `DataSplit` (for direct data file reads) or an `IndexedSplit`
/// (for reads leveraging global indexes).
class PAIMON_EXPORT Split {
public:
virtual ~Split() = default;

/// Deserialize a `Split` from a binary buffer.
///
/// Creates a `Split` instance from its serialized binary representation.
/// This is typically used in distributed computing scenarios where splits
/// are transmitted between different nodes or processes.
///
/// @param buffer Const pointer to the binary data containing the serialized `Split`.
/// @param length Size of the buffer in bytes.
/// @param pool Memory pool for allocating objects during deserialization.
/// @return Result containing the deserialized `Split` or an error status.
static Result<std::shared_ptr<Split>> Deserialize(const char* buffer, size_t length,
const std::shared_ptr<MemoryPool>& pool);

/// Serialize a `Split` to a binary string.
///
/// Converts a `Split` instance to its binary representation for storage
/// or transmission. The serialized data can later be deserialized using
/// the Deserialize method.
///
/// @param split The `Split` instance to serialize.
/// @param pool Memory pool for allocating temporary objects during serialization.
/// @return Result containing the serialized binary data as a string or an error status.
static Result<std::string> Serialize(const std::shared_ptr<Split>& split,
const std::shared_ptr<MemoryPool>& pool);
};
} // namespace paimon
17 changes: 8 additions & 9 deletions include/paimon/table/source/table_read.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,14 @@
#include "paimon/read_context.h"
#include "paimon/reader/batch_reader.h"
#include "paimon/result.h"
#include "paimon/table/source/data_split.h"
#include "paimon/table/source/split.h"
#include "paimon/visibility.h"

namespace paimon {
class DataSplit;
class MemoryPool;
class ReadContext;

/// Given a `DataSplit` or a list of `DataSplit`, generate a reader for batch reading.
/// Given a `Split` or a list of `Split`, generate a reader for batch reading.
class PAIMON_EXPORT TableRead {
public:
virtual ~TableRead() = default;
Expand All @@ -46,21 +45,21 @@ class PAIMON_EXPORT TableRead {
/// Creates a `BatchReader` instance for reading data.
///
/// This method creates a BatchReader that will be responsible for reading data from the
/// provided data splits.
/// provided splits.
///
/// @param data_splits A vector of shared pointers to `DataSplit` instances representing the
/// @param splits A vector of shared pointers to `Split` instances representing the
/// data to be read.
/// @return A Result containing a unique pointer to the `BatchReader` instance.
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
const std::vector<std::shared_ptr<DataSplit>>& data_splits);
const std::vector<std::shared_ptr<Split>>& splits);

/// Creates a `BatchReader` instance for a single data split.
/// Creates a `BatchReader` instance for a single split.
///
/// @param data_split A shared pointer to the `DataSplit` instance that defines the data to be
/// @param split A shared pointer to the `Split` instance that defines the data to be
/// read.
/// @return A Result containing a unique pointer to the `BatchReader` instance.
virtual Result<std::unique_ptr<BatchReader>> CreateReader(
const std::shared_ptr<DataSplit>& data_split) = 0;
const std::shared_ptr<Split>& split) = 0;

protected:
explicit TableRead(const std::shared_ptr<MemoryPool>& memory_pool);
Expand Down
15 changes: 15 additions & 0 deletions include/paimon/utils/range.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once
#include <optional>
#include <string>
#include <vector>

#include "paimon/visibility.h"

Expand All @@ -28,9 +29,23 @@ struct PAIMON_EXPORT Range {
/// Returns the number of integers in the range [from, to].
int64_t Count() const;

/// Computes the intersection of two ranges.
static std::optional<Range> Intersection(const Range& left, const Range& right);

/// Checks whether two ranges have any overlap.
static bool HasIntersection(const Range& left, const Range& right);

/// Sorts a list of ranges by `from`, then merges overlapping or adjacent ranges.
/// @param ranges Input vector of ranges to merge.
/// @param adjacent If true, also merges ranges that are adjacent (e.g., [1,3] and [4,5] →
/// [1,5]).
/// If false, only merges strictly overlapping ranges.
/// @return A new vector of non-overlapping, sorted ranges.
static std::vector<Range> SortAndMergeOverlap(const std::vector<Range>& ranges, bool adjacent);

/// Computes the set intersection of two collections of disjoint, sorted ranges.
static std::vector<Range> And(const std::vector<Range>& left, const std::vector<Range>& right);

bool operator==(const Range& other) const;
bool operator<(const Range& other) const;

Expand Down
Loading
Loading