Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ serde = "1.0.197"
serde_json = "1.0.116"
serde_test = "1.0.176"
simplelog = { version = "0.12.2", features = ["paris"] }
static_assertions = "1"
tar = "0.4"
tempfile = "3"
thiserror = "1.0.58"
Expand Down
2 changes: 1 addition & 1 deletion bench-vortex/benches/compress_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ fn vortex_compress_tpch(c: &mut Criterion) {
let comments_canonical = comments
.into_canonical()
.unwrap()
.into_varbin()
.into_varbinview()
.unwrap()
.into_array();
group.bench_function("compress-fsst-canonicalized", |b| {
Expand Down
58 changes: 29 additions & 29 deletions bench-vortex/src/tpch/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,62 +7,62 @@ use lazy_static::lazy_static;
lazy_static! {
pub static ref NATION: Schema = Schema::new(vec![
Field::new("n_nationkey", DataType::Int64, false),
Field::new("n_name", DataType::Utf8, false),
Field::new("n_name", DataType::Utf8View, false),
Field::new("n_regionkey", DataType::Int64, false),
Field::new("n_comment", DataType::Utf8, true),
Field::new("n_comment", DataType::Utf8View, true),
]);
pub static ref REGION: Schema = Schema::new(vec![
Field::new("r_regionkey", DataType::Int64, false),
Field::new("r_name", DataType::Utf8, false),
Field::new("r_comment", DataType::Utf8, true),
Field::new("r_name", DataType::Utf8View, false),
Field::new("r_comment", DataType::Utf8View, true),
]);
pub static ref PART: Schema = Schema::new(vec![
Field::new("p_partkey", DataType::Int64, false),
Field::new("p_name", DataType::Utf8, false),
Field::new("p_mfgr", DataType::Utf8, false),
Field::new("p_brand", DataType::Utf8, false),
Field::new("p_type", DataType::Utf8, false),
Field::new("p_name", DataType::Utf8View, false),
Field::new("p_mfgr", DataType::Utf8View, false),
Field::new("p_brand", DataType::Utf8View, false),
Field::new("p_type", DataType::Utf8View, false),
Field::new("p_size", DataType::Int32, false),
Field::new("p_container", DataType::Utf8, false),
Field::new("p_container", DataType::Utf8View, false),
Field::new("p_retailprice", DataType::Float64, false),
Field::new("p_comment", DataType::Utf8, false),
Field::new("p_comment", DataType::Utf8View, false),
]);
pub static ref SUPPLIER: Schema = Schema::new(vec![
Field::new("s_suppkey", DataType::Int64, false),
Field::new("s_name", DataType::Utf8, false),
Field::new("s_address", DataType::Utf8, false),
Field::new("s_name", DataType::Utf8View, false),
Field::new("s_address", DataType::Utf8View, false),
Field::new("s_nationkey", DataType::Int32, false),
Field::new("s_phone", DataType::Utf8, false),
Field::new("s_phone", DataType::Utf8View, false),
Field::new("s_acctbal", DataType::Float64, false),
Field::new("s_comment", DataType::Utf8, false),
Field::new("s_comment", DataType::Utf8View, false),
]);
pub static ref PARTSUPP: Schema = Schema::new(vec![
Field::new("ps_partkey", DataType::Int64, false),
Field::new("ps_suppkey", DataType::Int64, false),
Field::new("ps_availqty", DataType::Int64, false),
Field::new("ps_supplycost", DataType::Float64, false),
Field::new("ps_comment", DataType::Utf8, false),
Field::new("ps_comment", DataType::Utf8View, false),
]);
pub static ref CUSTOMER: Schema = Schema::new(vec![
Field::new("c_custkey", DataType::Int64, false),
Field::new("c_name", DataType::Utf8, false),
Field::new("c_address", DataType::Utf8, false),
Field::new("c_name", DataType::Utf8View, false),
Field::new("c_address", DataType::Utf8View, false),
Field::new("c_nationkey", DataType::Int64, false),
Field::new("c_phone", DataType::Utf8, false),
Field::new("c_phone", DataType::Utf8View, false),
Field::new("c_acctbal", DataType::Float64, false),
Field::new("c_mktsegment", DataType::Utf8, false),
Field::new("c_comment", DataType::Utf8, false),
Field::new("c_mktsegment", DataType::Utf8View, false),
Field::new("c_comment", DataType::Utf8View, false),
]);
pub static ref ORDERS: Schema = Schema::new(vec![
Field::new("o_orderkey", DataType::Int64, false),
Field::new("o_custkey", DataType::Int64, false),
Field::new("o_orderstatus", DataType::Utf8, false),
Field::new("o_orderstatus", DataType::Utf8View, false),
Field::new("o_totalprice", DataType::Float64, false),
Field::new("o_orderdate", DataType::Date32, false),
Field::new("o_orderpriority", DataType::Utf8, false),
Field::new("o_clerk", DataType::Utf8, false),
Field::new("o_orderpriority", DataType::Utf8View, false),
Field::new("o_clerk", DataType::Utf8View, false),
Field::new("o_shippriority", DataType::Int32, false),
Field::new("o_comment", DataType::Utf8, false),
Field::new("o_comment", DataType::Utf8View, false),
]);
pub static ref LINEITEM: Schema = Schema::new(vec![
Field::new("l_orderkey", DataType::Int64, false),
Expand All @@ -73,13 +73,13 @@ lazy_static! {
Field::new("l_extendedprice", DataType::Float64, false),
Field::new("l_discount", DataType::Float64, false),
Field::new("l_tax", DataType::Float64, false),
Field::new("l_returnflag", DataType::Utf8, false),
Field::new("l_linestatus", DataType::Utf8, false),
Field::new("l_returnflag", DataType::Utf8View, false),
Field::new("l_linestatus", DataType::Utf8View, false),
Field::new("l_shipdate", DataType::Date32, false),
Field::new("l_commitdate", DataType::Date32, false),
Field::new("l_receiptdate", DataType::Date32, false),
Field::new("l_shipinstruct", DataType::Utf8, false),
Field::new("l_shipmode", DataType::Utf8, false),
Field::new("l_comment", DataType::Utf8, false),
Field::new("l_shipinstruct", DataType::Utf8View, false),
Field::new("l_shipmode", DataType::Utf8View, false),
Field::new("l_comment", DataType::Utf8View, false),
]);
}
9 changes: 8 additions & 1 deletion encodings/dict/src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use hashbrown::hash_map::{Entry, RawEntryMut};
use hashbrown::HashMap;
use num_traits::AsPrimitive;
use vortex::accessor::ArrayAccessor;
use vortex::array::{PrimitiveArray, VarBinArray};
use vortex::array::{PrimitiveArray, VarBinArray, VarBinViewArray};
use vortex::validity::Validity;
use vortex::{ArrayDType, IntoArray};
use vortex_dtype::{match_each_native_ptype, DType, NativePType, ToBytes};
Expand Down Expand Up @@ -89,6 +89,13 @@ pub fn dict_encode_varbin(array: &VarBinArray) -> (PrimitiveArray, VarBinArray)
.unwrap()
}

/// Dictionary encode a VarbinViewArray.
pub fn dict_encode_varbinview(array: &VarBinViewArray) -> (PrimitiveArray, VarBinArray) {
array
.with_iterator(|iter| dict_encode_typed_varbin(array.dtype().clone(), iter))
.unwrap()
}

fn lookup_bytes<'a, T: NativePType + AsPrimitive<usize>>(
offsets: &'a [T],
bytes: &'a [u8],
Expand Down
40 changes: 25 additions & 15 deletions encodings/dict/src/compute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@ impl SliceFn for DictArray {

#[cfg(test)]
mod test {
use vortex::array::{PrimitiveArray, VarBinArray};
use vortex::accessor::ArrayAccessor;
use vortex::array::{PrimitiveArray, VarBinViewArray};
use vortex::{IntoArray, IntoArrayVariant, ToArray};
use vortex_dtype::{DType, Nullability};

use crate::{dict_encode_typed_primitive, dict_encode_varbin, DictArray};
use crate::{dict_encode_typed_primitive, dict_encode_varbinview, DictArray};

#[test]
fn flatten_nullable_primitive() {
Expand All @@ -79,20 +79,30 @@ mod test {

#[test]
fn flatten_nullable_varbin() {
let reference = VarBinArray::from_iter(
vec![Some("a"), Some("b"), None, Some("a"), None, Some("b")],
DType::Utf8(Nullability::Nullable),
);
let (codes, values) = dict_encode_varbin(&reference);
let reference = VarBinViewArray::from_iter_nullable_str(vec![
Some("a"),
Some("b"),
None,
Some("a"),
None,
Some("b"),
]);
assert_eq!(reference.len(), 6);
let (codes, values) = dict_encode_varbinview(&reference);
let dict = DictArray::try_new(codes.into_array(), values.into_array()).unwrap();
let flattened_dict = dict.to_array().into_varbin().unwrap();
assert_eq!(
flattened_dict.offsets().into_primitive().unwrap().buffer(),
reference.offsets().into_primitive().unwrap().buffer()
);
let flattened_dict = dict.to_array().into_varbinview().unwrap();

assert_eq!(
flattened_dict.bytes().into_primitive().unwrap().buffer(),
reference.bytes().into_primitive().unwrap().buffer()
flattened_dict
.with_iterator(|iter| iter
.map(|slice| slice.map(|s| s.to_vec()))
.collect::<Vec<_>>())
.unwrap(),
reference
.with_iterator(|iter| iter
.map(|slice| slice.map(|s| s.to_vec()))
.collect::<Vec<_>>())
.unwrap(),
);
}
}
29 changes: 10 additions & 19 deletions encodings/fsst/src/canonical.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
use arrow_array::builder::GenericByteBuilder;
use arrow_array::types::BinaryType;
use fsst::Symbol;
use vortex::array::VarBinArray;
use arrow_array::builder::BinaryViewBuilder;
use vortex::array::VarBinViewArray;
use vortex::arrow::FromArrowArray;
use vortex::validity::ArrayValidity;
use vortex::{ArrayDType, Canonical, IntoCanonical};
Expand All @@ -12,17 +10,10 @@ use crate::FSSTArray;
impl IntoCanonical for FSSTArray {
fn into_canonical(self) -> VortexResult<Canonical> {
self.with_decompressor(|decompressor| {
// Note: the maximum amount of decompressed space for an FSST array is 8 * n_elements,
// as each code can expand into a symbol of 1-8 bytes.
let max_items = self.len();
let max_bytes = self.codes().nbytes() * size_of::<Symbol>();

// Create the target Arrow binary array
// TODO(aduffy): switch to BinaryView when PR https://github.com/spiraldb/vortex/pull/476 merges
let mut builder = GenericByteBuilder::<BinaryType>::with_capacity(max_items, max_bytes);
let mut builder = BinaryViewBuilder::with_capacity(self.len());

// TODO(aduffy): add decompression functions that support writing directly into and output buffer.
let codes_array = self.codes().into_canonical()?.into_varbin()?;
let codes_array = self.codes().into_canonical()?.into_varbinview()?;

// TODO(aduffy): make this loop faster.
for idx in 0..self.len() {
Expand All @@ -37,20 +28,20 @@ impl IntoCanonical for FSSTArray {

let arrow_array = builder.finish();

// Force the DTYpe
let canonical_varbin = VarBinArray::try_from(&vortex::Array::from_arrow(
// Force the DType
let canonical_varbin = VarBinViewArray::try_from(&vortex::Array::from_arrow(
&arrow_array,
self.dtype().is_nullable(),
))?;

let forced_dtype = VarBinArray::try_new(
canonical_varbin.offsets(),
canonical_varbin.bytes(),
let forced_dtype = VarBinViewArray::try_new(
canonical_varbin.views(),
canonical_varbin.buffers().collect(),
self.dtype().clone(),
canonical_varbin.validity(),
)?;

Ok(Canonical::VarBin(forced_dtype))
Ok(Canonical::VarBinView(forced_dtype))
})
}
}
2 changes: 1 addition & 1 deletion pyvortex/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ build-backend = "maturin"
[tool.rye]
managed = true
dev-dependencies = [
"pyarrow>=15.0.0",
"pyarrow>=17.0.0",
"pip"
]

Expand Down
8 changes: 4 additions & 4 deletions pyvortex/test/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,22 @@ def test_primitive_array_round_trip():


def test_array_with_nulls():
a = pa.array([b"123", None])
a = pa.array([b"123", None], type=pa.string_view())
arr = vortex.encode(a)
assert arr.to_arrow().combine_chunks() == a


def test_varbin_array_round_trip():
a = pa.array(["a", "b", "c"])
a = pa.array(["a", "b", "c"], type=pa.string_view())
arr = vortex.encode(a)
assert arr.to_arrow().combine_chunks() == a


def test_varbin_array_take():
def test_varbinview_array_take():
a = vortex.encode(pa.array(["a", "b", "c", "d"]))
assert a.take(vortex.encode(pa.array([0, 2]))).to_arrow().combine_chunks() == pa.array(
["a", "c"],
type=pa.utf8(),
type=pa.string_view(),
)


Expand Down
4 changes: 2 additions & 2 deletions pyvortex/test/test_compress.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@ def test_table_encode():
table = pa.table(
{
"number": pa.chunked_array([pa.array([0, 1, 2]), pa.array([3, 4, 5])]),
"string": pa.chunked_array([pa.array(["a", "b", "c"]), pa.array(["d", "e", "f"])]),
"string": pa.chunked_array([pa.array(["a", "b", "c"], type=pa.string_view()), pa.array(["d", "e", "f"], type=pa.string_view())]),
}
)
encoded = vortex.encode(table)
assert encoded.to_arrow().combine_chunks() == pa.StructArray.from_arrays(
[pa.array([0, 1, 2, 3, 4, 5]), pa.array(["a", "b", "c", "d", "e", "f"])], names=["number", "string"]
[pa.array([0, 1, 2, 3, 4, 5]), pa.array(["a", "b", "c", "d", "e", "f"], type=pa.string_view())], names=["number", "string"]
)


Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ pluggy==1.4.0
# via pytest
py-cpuinfo==9.0.0
# via pytest-benchmark
pyarrow==15.0.2
pyarrow==17.0.0
pygments==2.17.2
# via mkdocs-material
pymdown-extensions==10.7.1
Expand Down
1 change: 1 addition & 0 deletions vortex-array/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ paste = { workspace = true }
pin-project = { workspace = true }
rand = { workspace = true }
serde = { workspace = true, features = ["derive"] }
static_assertions = { workspace = true }
vortex-buffer = { workspace = true }
vortex-datetime-dtype = { workspace = true }
vortex-dtype = { workspace = true }
Expand Down
Loading