Skip to content

Commit dba1e0f

Browse files
feat: add multi-path support for lance data paths (#4765)
Fixes #4702 Supports writing to multi bases for lance datasets. Co-authored-by: Jack Ye <yezhaoqin@gmail.com>
1 parent a3ed68d commit dba1e0f

29 files changed

Lines changed: 2060 additions & 119 deletions

File tree

.github/workflows/rust.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ jobs:
156156
ALL_FEATURES=`cargo metadata --format-version=1 --no-deps | jq -r '.packages[] | .features | keys | .[]' | grep -v protoc | sort | uniq | paste -s -d "," -`
157157
cargo test --locked --features ${ALL_FEATURES}
158158
build-no-lock:
159-
runs-on: ubuntu-24.04
159+
runs-on: warp-ubuntu-2404-x64-8x
160160
timeout-minutes: 30
161161
env:
162162
# Need up-to-date compilers for kernels

java/lance-jni/src/blocking_dataset.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,7 @@ pub fn inner_commit_overwrite<'local>(
564564
fragments,
565565
schema,
566566
config_upsert_values: None,
567+
initial_bases: None,
567568
};
568569
let path_str = path.extract(env)?;
569570
let read_version = env.get_u64_opt(&read_version_obj)?;

java/lance-jni/src/transaction.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,7 @@ fn convert_to_java_operation_inner<'local>(
472472
fragments: rust_fragments,
473473
schema,
474474
config_upsert_values,
475+
initial_bases: _,
475476
} => {
476477
let java_fragments = export_vec(env, &rust_fragments)?;
477478
let java_schema = convert_to_java_schema(env, schema)?;
@@ -890,6 +891,7 @@ fn convert_to_rust_operation(
890891
fragments,
891892
schema,
892893
config_upsert_values,
894+
initial_bases: None,
893895
}
894896
}
895897
"Rewrite" => {

protos/transaction.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ message Transaction {
6868
map<string, bytes> schema_metadata = 3;
6969
// Key-value pairs to merge with existing config.
7070
map<string, string> config_upsert_values = 4;
71+
// The base paths to be added for the initial dataset creation
72+
repeated BasePath initial_bases = 5;
7173
}
7274

7375
// Add or replace a new secondary index.

python/python/lance/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
)
2727
from .fragment import FragmentMetadata, LanceFragment
2828
from .lance import (
29+
DatasetBasePath,
2930
FFILanceTableProvider,
3031
ScanStatistics,
3132
bytes_read_counter,
@@ -47,6 +48,7 @@
4748
__all__ = [
4849
"BlobColumn",
4950
"BlobFile",
51+
"DatasetBasePath",
5052
"DataStatistics",
5153
"FieldStatistics",
5254
"FragmentMetadata",

python/python/lance/dataset.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
CleanupStats,
5353
Compaction,
5454
CompactionMetrics,
55+
DatasetBasePath,
5556
LanceSchema,
5657
ScanStatistics,
5758
_Dataset,
@@ -4897,6 +4898,8 @@ def write_dataset(
48974898
auto_cleanup_options: Optional[AutoCleanupConfig] = None,
48984899
commit_message: Optional[str] = None,
48994900
transaction_properties: Optional[Dict[str, str]] = None,
4901+
initial_bases: Optional[List[DatasetBasePath]] = None,
4902+
target_bases: Optional[List[str]] = None,
49004903
) -> LanceDataset:
49014904
"""Write a given data_obj to the given uri
49024905
@@ -4975,6 +4978,19 @@ def write_dataset(
49754978
and can be retrieved using read_transaction().
49764979
If both `commit_message` and `properties` are provided, `commit_message` will
49774980
override any "lance.commit.message" key in `properties`.
4981+
initial_bases: list of DatasetBasePath, optional
4982+
New base paths to register in the manifest. Only used in **CREATE mode**.
4983+
Cannot be specified in APPEND or OVERWRITE modes.
4984+
target_bases: list of str, optional
4985+
References to base paths where data should be written. Can be
4986+
specified in all modes.
4987+
4988+
Each string is resolved by trying to match:
4989+
1. Base name (e.g., "primary", "archive") from registered bases
4990+
2. Base path URI (e.g., "s3://bucket1/data")
4991+
4992+
**CREATE mode**: References must match bases in `initial_bases`
4993+
**APPEND/OVERWRITE modes**: References must match bases in the existing manifest
49784994
"""
49794995
if use_legacy_format is not None:
49804996
warnings.warn(
@@ -5016,6 +5032,8 @@ def write_dataset(
50165032
"enable_stable_row_ids": enable_stable_row_ids,
50175033
"auto_cleanup_options": auto_cleanup_options,
50185034
"transaction_properties": merged_properties,
5035+
"initial_bases": initial_bases,
5036+
"target_bases": target_bases,
50195037
}
50205038

50215039
if commit_lock:

0 commit comments

Comments
 (0)