Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/paimon/global_index/bitmap_global_index_result.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "paimon/visibility.h"

namespace paimon {
/// Represents a global index query result that **lazily materializes** its matching row IDs as a
/// Represents a global index query result that **lazily materializes** its matching row ids as a
/// Roaring bitmap. The underlying 64-bit Roaring bitmap is **not constructed during object
/// creation**; instead, it is built on-demand the first time GetBitmap() is called. This design
/// avoids unnecessary computation and memory allocation when the bitmap is not needed (e.g., during
Expand Down Expand Up @@ -67,6 +67,8 @@ class PAIMON_EXPORT BitmapGlobalIndexResult : public GlobalIndexResult {

Result<bool> IsEmpty() const override;

Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override;

std::string ToString() const override;

/// @return A non-owning, const pointer to the bitmap. The returned pointer is valid as long as
Expand Down
10 changes: 6 additions & 4 deletions include/paimon/global_index/bitmap_topk_global_index_result.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,12 @@
#include "paimon/visibility.h"

namespace paimon {
/// Represents a Top-K global index result that combines a Roaring bitmap of candidate row IDs
/// Represents a Top-K global index result that combines a Roaring bitmap of candidate row ids
/// with an array of associated relevance scores.
///
/// **Important Ordering Note**: Despite inheriting from TopKGlobalIndexResult, the results are
/// **NOT sorted by score**. Instead, both the bitmap and the score vector are ordered by
/// **ascending row ID**. This design enables efficient merging and set operations while preserving
/// **ascending row id**. This design enables efficient merging and set operations while preserving
/// row id-to-score mapping.
class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
public:
Expand Down Expand Up @@ -74,16 +74,18 @@ class PAIMON_EXPORT BitmapTopKGlobalIndexResult : public TopKGlobalIndexResult {
Result<std::shared_ptr<GlobalIndexResult>> Or(
const std::shared_ptr<GlobalIndexResult>& other) override;

Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) override;

Result<bool> IsEmpty() const override;

std::string ToString() const override;

/// @return A non-owning, const pointer to the bitmap. The row IDs in the bitmap are stored in
/// @return A non-owning, const pointer to the bitmap. The row ids in the bitmap are stored in
/// ascending order (as guaranteed by Roaring64 iteration).
Result<const RoaringBitmap64*> GetBitmap() const;

/// @return A const reference to a vector of float scores, where the i-th element corresponds to
/// the i-th row ID when iterating the bitmap in **ascending row ID order**.
/// the i-th row id when iterating the bitmap in **ascending row id order**.
const std::vector<float>& GetScores() const;

private:
Expand Down
8 changes: 4 additions & 4 deletions include/paimon/global_index/global_index_io_meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,17 @@
namespace paimon {
/// Metadata describing a single file entry in a global index.
struct PAIMON_EXPORT GlobalIndexIOMeta {
GlobalIndexIOMeta(const std::string& _file_name, int64_t _file_size, const Range& _row_id_range,
GlobalIndexIOMeta(const std::string& _file_name, int64_t _file_size, int64_t _range_end,
const std::shared_ptr<Bytes>& _metadata)
: file_name(_file_name),
file_size(_file_size),
row_id_range(_row_id_range),
range_end(_range_end),
metadata(_metadata) {}

std::string file_name;
int64_t file_size;
/// The inclusive range of row IDs covered by this file (i.e., [from, to]).
Range row_id_range;
/// The inclusive range end covered by this file (i.e., the last local row id).
int64_t range_end;
/// Optional binary metadata associated with the file, such as serialized
/// secondary index structures or inline index bytes.
/// May be null if no additional metadata is available.
Expand Down
18 changes: 11 additions & 7 deletions include/paimon/global_index/global_index_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,20 @@

namespace paimon {
/// Reads and evaluates filter predicates against a global file index.
/// `GlobalIndexReader` is an implementation of the `FunctionVisitor` interface
/// specialized to produce `std::shared_ptr<GlobalIndexResult>` objects.
///
/// Derived classes are expected to implement the visitor methods (e.g., `VisitEqual`,
/// `VisitIsNull`, etc.) to return index-based results that indicate which
/// row satisfy the given predicate.
///
/// @note All `GlobalIndexResult` objects returned by implementations of this class use **local row
/// ids** that start from 0 — not global row ids in the entire table.
/// The `GlobalIndexResult` can be converted to global row ids by calling `AddOffset()`.
class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<GlobalIndexResult>> {
public:
/// TopKPreFilter: A lightweight pre-filtering function applied **before** similarity scoring.
/// It operates solely on row_id and is typically driven by other global index, such as bitmap,
/// or range index. This filter enables early pruning of irrelevant candidates (e.g., "only
/// consider rows with label X"), significantly reducing the search space. Returns true to
/// It operates solely on **local row ids** and is typically driven by other global index, such
/// as bitmap, or range index. This filter enables early pruning of irrelevant candidates (e.g.,
/// "only consider rows with label X"), significantly reducing the search space. Returns true to
/// include the row in Top-K computation; false to exclude it.
///
/// @note Must be thread-safe.
Expand All @@ -47,7 +49,8 @@ class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<G
///
/// @param k Number of top results to return.
/// @param query The query vector (must match the dimensionality of the indexed vectors).
/// @param filter A pre-filter based on row_id, implemented by leveraging other global index
/// @param filter A pre-filter based on **local row ids**, implemented by leveraging other
/// global index
/// structures (e.g., bitmap index) for efficient candidate pruning.
/// @param predicate A runtime filtering condition that may involve graph traversal of
/// structured attributes. **Using this parameter often yields better
Expand All @@ -58,7 +61,8 @@ class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<G
/// context-aware filtering at query time.
/// @note All fields referenced in the predicate must have been materialized
/// in the index during build to ensure availability.
/// @note `VisitTopK` is thread-safe while other `VisitXXX` is not.
/// @note `VisitTopK` is thread-safe (not coroutine-safe) while other `VisitXXX` is not
/// thread-safe.
virtual Result<std::shared_ptr<TopKGlobalIndexResult>> VisitTopK(
int32_t k, const std::vector<float>& query, TopKPreFilter filter,
const std::shared_ptr<Predicate>& predicate) = 0;
Expand Down
10 changes: 7 additions & 3 deletions include/paimon/global_index/global_index_result.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
virtual int64_t Next() = 0;
};

/// Checks whether the global index result contains no matching row IDs.
/// Checks whether the global index result contains no matching row ids.
///
/// @return A `Result<bool>` where:
/// - `true` indicates the result is empty (no matching rows),
Expand All @@ -67,6 +67,10 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
virtual Result<std::shared_ptr<GlobalIndexResult>> Or(
const std::shared_ptr<GlobalIndexResult>& other);

/// Adds the given offset to each row id in current result and returns the new global index
/// result.
virtual Result<std::shared_ptr<GlobalIndexResult>> AddOffset(int64_t offset) = 0;

virtual std::string ToString() const = 0;

/// Serializes a GlobalIndexResult object into a byte array.
Expand Down Expand Up @@ -103,7 +107,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
};

/// Represents the result of a Top-K query against a global index.
/// This class encapsulates a set of top-K candidates (row ID + score pairs) and provides
/// This class encapsulates a set of top-K candidates (row id + score pairs) and provides
/// an iterator interface to traverse them.
class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
public:
Expand All @@ -115,7 +119,7 @@ class PAIMON_EXPORT TopKGlobalIndexResult : public GlobalIndexResult {
public:
virtual ~TopKIterator() = default;

/// Checks whether more row IDs are available.
/// Checks whether more row ids are available.
virtual bool HasNext() const = 0;

/// Retrieves the next (row_id, score) pair and advances the iterator.
Expand Down
14 changes: 7 additions & 7 deletions include/paimon/global_index/global_index_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class PAIMON_EXPORT GlobalIndexScan {
/// Creates a `GlobalIndexScan` instance for the specified table and context.
///
/// @param table_path Root directory of the table.
/// @param snapshot_id Optional snapshot ID to read from; if not provided, uses the latest.
/// @param snapshot_id Optional snapshot id to read from; if not provided, uses the latest.
/// @param partitions Optional list of specific partitions to restrict the scan scope.
/// Each map represents one partition (e.g., {"dt": "2024-06-01"}).
/// If omitted, scans all partitions.
Expand Down Expand Up @@ -65,23 +65,23 @@ class PAIMON_EXPORT GlobalIndexScan {

virtual ~GlobalIndexScan() = default;

/// Creates a scanner for the global index over the specified row ID range.
/// Creates a scanner for the global index over the specified row id range.
///
/// This method instantiates a low-level scanner that can evaluate predicates and
/// retrieve matching row IDs from the global index data corresponding to the given
/// row ID range.
/// retrieve matching row ids from the global index data corresponding to the given
/// row id range.
///
/// @param range The inclusive row ID range [start, end] for which to create the scanner.
/// @param range The inclusive row id range [start, end] for which to create the scanner.
/// The range must be fully covered by existing global index data (from
/// `GetRowRangeList()`).
/// @return A `Result` containing a range-level scanner, or an error if parse index meta fails.
virtual Result<std::shared_ptr<RowRangeGlobalIndexScanner>> CreateRangeScan(
const Range& range) = 0;

/// Returns row ID ranges covered by this global index (sorted and non-overlapping
/// Returns row id ranges covered by this global index (sorted and non-overlapping
/// ranges).
///
/// Each `Range` represents a contiguous segment of row IDs for which global index
/// Each `Range` represents a contiguous segment of row ids for which global index
/// data exists. This allows the query engine to parallelize scanning and be aware
/// of ranges that are not covered by any global index.
///
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@

namespace paimon {
/// Writes a range-level global index for a specific data split and field.
class PAIMON_EXPORT RowRangeGlobalIndexWriter {
class PAIMON_EXPORT GlobalIndexWriteTask {
public:
RowRangeGlobalIndexWriter() = delete;
~RowRangeGlobalIndexWriter() = delete;
GlobalIndexWriteTask() = delete;
~GlobalIndexWriteTask() = delete;
/// Builds and writes a global index for the specified data range.
///
/// @param table_path Path to the table root directory where index files are stored.
Expand Down
12 changes: 2 additions & 10 deletions include/paimon/global_index/row_range_global_index_scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
#include <memory>
#include <string>

#include "paimon/global_index/global_index_evaluator.h"
#include "paimon/global_index/global_index_reader.h"
#include "paimon/visibility.h"

Expand All @@ -29,15 +28,6 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
public:
virtual ~RowRangeGlobalIndexScanner() = default;

/// Creates a `GlobalIndexEvaluator` tailored to this range's index layout.
///
/// The returned evaluator can be used to assess whether a given predicate can be
/// answered using the global index data of this shard (e.g., via bitmap intersection).
///
/// @return A `Result` containing a shared pointer to the evaluator, or an error
/// if the index metadata is invalid or unsupported.
virtual Result<std::shared_ptr<GlobalIndexEvaluator>> CreateIndexEvaluator() const = 0;

/// Creates a `GlobalIndexReader` for a specific field and index type within this range.
///
/// This reader provides low-level access to the serialized index data
Expand All @@ -50,6 +40,8 @@ class PAIMON_EXPORT RowRangeGlobalIndexScanner {
/// - Successful with a null pointer if no index was built for the given field and type;
/// - An error only if loading fails (e.g., file corruption, I/O error, unsupported
/// format).
/// @note All `GlobalIndexResult` objects returned by `GlobalIndexReader` use **local row
/// ids** that start from 0 — not global row ids in the entire table.
virtual Result<std::shared_ptr<GlobalIndexReader>> CreateReader(
const std::string& field_name, const std::string& index_type) const = 0;

Expand Down
6 changes: 3 additions & 3 deletions include/paimon/schema/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class PAIMON_EXPORT Schema {
virtual std::vector<std::string> FieldNames() const = 0;

/// Get the unique identifier of this table schema.
/// @return The schema ID
/// @return The schema id
virtual int64_t Id() const = 0;

/// Get the list of primary key field names.
Expand All @@ -65,8 +65,8 @@ class PAIMON_EXPORT Schema {
/// @return The number of buckets.
virtual int32_t NumBuckets() const = 0;

/// Get the highest field ID assigned in this schema.
/// @return The maximum field ID.
/// Get the highest field id assigned in this schema.
/// @return The maximum field id.
virtual int32_t HighestFieldId() const = 0;

/// Get the table-level options associated with this schema.
Expand Down
6 changes: 3 additions & 3 deletions include/paimon/utils/bucket_id_calculator.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ struct ArrowArray;
namespace paimon {
class MemoryPool;

/// Calculator for determining bucket IDs based on the given bucket keys.
/// Calculator for determining bucket ids based on the given bucket keys.
///
/// @note `BucketIdCalculator` is compatible with the Java implementation and uses
/// hash-based distribution to ensure even data distribution across buckets.
Expand All @@ -47,10 +47,10 @@ class PAIMON_EXPORT BucketIdCalculator {
/// @param num_buckets Number of buckets.
static Result<std::unique_ptr<BucketIdCalculator>> Create(bool is_pk_table,
int32_t num_buckets);
/// Calculate bucket IDs for the given bucket keys.
/// Calculate bucket ids for the given bucket keys.
/// @param bucket_keys Arrow struct array containing the bucket key values.
/// @param bucket_schema Arrow schema describing the structure of bucket_keys.
/// @param bucket_ids Output array to store calculated bucket IDs.
/// @param bucket_ids Output array to store calculated bucket ids.
/// @note 1. bucket_keys is a struct array, the order of fields needs to be consistent with
/// "bucket-key" options in table schema. 2. bucket_keys and bucket_schema match each other. 3.
/// bucket_ids is allocated enough space, at least >= bucket_keys->length
Expand Down
3 changes: 3 additions & 0 deletions include/paimon/utils/roaring_bitmap32.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,9 @@ class PAIMON_EXPORT RoaringBitmap32 {
/// Fast union multiple bitmaps.
static RoaringBitmap32 FastUnion(const std::vector<RoaringBitmap32>& inputs);

class RoaringBitmap64;
friend class RoaringBitmap64;

private:
void* roaring_bitmap_ = nullptr;
};
Expand Down
4 changes: 4 additions & 0 deletions include/paimon/utils/roaring_bitmap64.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "paimon/memory/bytes.h"
#include "paimon/memory/memory_pool.h"
#include "paimon/status.h"
#include "paimon/utils/roaring_bitmap32.h"
#include "paimon/visibility.h"

namespace paimon {
Expand All @@ -42,6 +43,9 @@ class PAIMON_EXPORT RoaringBitmap64 {
RoaringBitmap64(RoaringBitmap64&&) noexcept;
RoaringBitmap64& operator=(RoaringBitmap64&&) noexcept;

explicit RoaringBitmap64(const RoaringBitmap32&) noexcept;
RoaringBitmap64& operator=(const RoaringBitmap32&) noexcept;

class PAIMON_EXPORT Iterator {
public:
friend class RoaringBitmap64;
Expand Down
2 changes: 1 addition & 1 deletion src/paimon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ set(PAIMON_CORE_SRCS
core/global_index/global_index_scan.cpp
core/global_index/global_index_scan_impl.cpp
core/global_index/row_range_global_index_scanner_impl.cpp
core/global_index/row_range_global_index_writer.cpp
core/global_index/global_index_write_task.cpp
core/index/index_file_handler.cpp
core/index/global_index_meta.cpp
core/index/index_file_meta_serializer.cpp
Expand Down
18 changes: 7 additions & 11 deletions src/paimon/common/global_index/bitmap/bitmap_global_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,33 +45,29 @@ Result<std::shared_ptr<GlobalIndexReader>> BitmapGlobalIndex::CreateReader(
PAIMON_ASSIGN_OR_RAISE(
std::shared_ptr<FileIndexReader> reader,
index_->CreateReader(arrow_schema, /*start=*/0, meta.file_size, in, pool));
auto transform = [range = meta.row_id_range](const std::shared_ptr<FileIndexResult>& result)
auto transform = [range_end = meta.range_end](const std::shared_ptr<FileIndexResult>& result)
-> Result<std::shared_ptr<GlobalIndexResult>> {
return ToGlobalIndexResult(range, result);
return ToGlobalIndexResult(range_end, result);
};
return std::make_shared<FileIndexReaderWrapper>(reader, transform);
}

Result<std::shared_ptr<GlobalIndexResult>> BitmapGlobalIndex::ToGlobalIndexResult(
const Range& range, const std::shared_ptr<FileIndexResult>& result) {
int64_t range_end, const std::shared_ptr<FileIndexResult>& result) {
if (auto remain = std::dynamic_pointer_cast<Remain>(result)) {
return std::make_shared<BitmapGlobalIndexResult>([range]() -> Result<RoaringBitmap64> {
return std::make_shared<BitmapGlobalIndexResult>([range_end]() -> Result<RoaringBitmap64> {
RoaringBitmap64 bitmap;
bitmap.AddRange(range.from, range.to + 1);
bitmap.AddRange(0, range_end + 1);
return bitmap;
});
} else if (auto skip = std::dynamic_pointer_cast<Skip>(result)) {
return std::make_shared<BitmapGlobalIndexResult>(
[]() -> Result<RoaringBitmap64> { return RoaringBitmap64(); });
} else if (auto bitmap_result = std::dynamic_pointer_cast<BitmapIndexResult>(result)) {
return std::make_shared<BitmapGlobalIndexResult>(
[range, bitmap_result]() -> Result<RoaringBitmap64> {
[bitmap_result]() -> Result<RoaringBitmap64> {
PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap32* bitmap, bitmap_result->GetBitmap());
RoaringBitmap64 bitmap64;
for (auto iter = bitmap->Begin(); iter != bitmap->End(); ++iter) {
bitmap64.Add(range.from + (*iter));
}
return bitmap64;
return RoaringBitmap64(*bitmap);
});
}
return Status::Invalid(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ class BitmapGlobalIndex : public GlobalIndexer {

private:
static Result<std::shared_ptr<GlobalIndexResult>> ToGlobalIndexResult(
const Range& range, const std::shared_ptr<FileIndexResult>& result);
int64_t range_end, const std::shared_ptr<FileIndexResult>& result);

private:
std::shared_ptr<BitmapFileIndex> index_;
Expand Down
Loading
Loading