From f7f9fd13736ac839b005e1b84dbbaa78ba117441 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sun, 19 Oct 2025 19:47:35 +0300 Subject: [PATCH] bench: create `zip` kernel benchmarks --- arrow/Cargo.toml | 5 + arrow/benches/zip_kernels.rs | 279 +++++++++++++++++++++++++++++++++++ arrow/src/util/bench_util.rs | 83 ++++++++++- 3 files changed, 364 insertions(+), 3 deletions(-) create mode 100644 arrow/benches/zip_kernels.rs diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index c77e85861d51..743628c8c7d1 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -177,6 +177,11 @@ name = "interleave_kernels" harness = false required-features = ["test_utils"] +[[bench]] +name = "zip_kernels" +harness = false +required-features = ["test_utils"] + [[bench]] name = "length_kernel" harness = false diff --git a/arrow/benches/zip_kernels.rs b/arrow/benches/zip_kernels.rs new file mode 100644 index 000000000000..5ec9f107d302 --- /dev/null +++ b/arrow/benches/zip_kernels.rs @@ -0,0 +1,279 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use criterion::measurement::WallTime; +use criterion::{BenchmarkGroup, BenchmarkId, Criterion, criterion_group, criterion_main}; +use rand::distr::{Distribution, StandardUniform}; +use rand::prelude::StdRng; +use rand::{Rng, SeedableRng}; +use std::hint; +use std::sync::Arc; + +use arrow::array::*; +use arrow::datatypes::*; +use arrow::util::bench_util::*; +use arrow_select::zip::zip; + +trait InputGenerator { + fn name(&self) -> &str; + + /// Return a single-element `ArrayRef` whose only value is null + fn generate_scalar_with_null_value(&self) -> ArrayRef; + + /// Generate `number_of_scalars` non-null scalars (each a single-element array) from `seed` + fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) -> Vec; + + /// Generate an array of `array_length` items; `null_percentage` is a fraction (`0.1` is 10%) + fn generate_array(&self, seed: u64, array_length: usize, null_percentage: f32) -> ArrayRef; +} + +struct GeneratePrimitive { + description: String, + _marker: std::marker::PhantomData, +} + +impl InputGenerator for GeneratePrimitive +where + T: ArrowPrimitiveType, + StandardUniform: Distribution, +{ + fn name(&self) -> &str { + self.description.as_str() + } + + fn generate_scalar_with_null_value(&self) -> ArrayRef { + new_null_array(&T::DATA_TYPE, 1) + } + + fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) -> Vec { + let rng = StdRng::seed_from_u64(seed); + + rng.sample_iter::(StandardUniform) + .take(number_of_scalars) + .map(|v: T::Native| { + Arc::new(PrimitiveArray::::new_scalar(v).into_inner()) as ArrayRef + }) + .collect() + } + + fn generate_array(&self, seed: u64, array_length: usize, null_percentage: f32) -> ArrayRef { + Arc::new(create_primitive_array_with_seed::( + array_length, + null_percentage, + seed, + )) + } +} + +struct GenerateBytes { + range_length: std::ops::Range, + description: String, + + _marker: std::marker::PhantomData, +} + +impl InputGenerator for GenerateBytes +where + Byte: ByteArrayType, +{ + fn name(&self) -> &str { + self.description.as_str() + } + + fn
generate_scalar_with_null_value(&self) -> ArrayRef { + new_null_array(&Byte::DATA_TYPE, 1) + } + + fn generate_non_null_scalars(&self, seed: u64, number_of_scalars: usize) -> Vec { + let array = self.generate_array(seed, number_of_scalars, 0.0); + + (0..number_of_scalars).map(|i| array.slice(i, 1)).collect() + } + + fn generate_array(&self, seed: u64, array_length: usize, null_percentage: f32) -> ArrayRef { + let is_binary = + Byte::DATA_TYPE == DataType::Binary || Byte::DATA_TYPE == DataType::LargeBinary; + if is_binary { + Arc::new(create_binary_array_with_len_range_and_prefix_and_seed::< + Byte::Offset, + >( + array_length, + null_percentage, + self.range_length.start, + self.range_length.end - 1, + &[], + seed, + )) + } else { + Arc::new(create_string_array_with_len_range_and_prefix_and_seed::< + Byte::Offset, + >( + array_length, + null_percentage, + self.range_length.start, + self.range_length.end - 1, + "", + seed, + )) + } + } +} + +fn mask_cases(len: usize) -> Vec<(&'static str, BooleanArray)> { + vec![ + ("all_true", create_boolean_array(len, 0.0, 1.0)), + ("99pct_true", create_boolean_array(len, 0.0, 0.99)), + ("90pct_true", create_boolean_array(len, 0.0, 0.9)), + ("50pct_true", create_boolean_array(len, 0.0, 0.5)), + ("10pct_true", create_boolean_array(len, 0.0, 0.1)), + ("1pct_true", create_boolean_array(len, 0.0, 0.01)), + ("all_false", create_boolean_array(len, 0.0, 0.0)), + ("50pct_nulls", create_boolean_array(len, 0.5, 0.5)), + ] +} + +fn bench_zip_on_input_generator(c: &mut Criterion, input_generator: &impl InputGenerator) { + const ARRAY_LEN: usize = 8192; + + let mut group = + c.benchmark_group(format!("zip_{ARRAY_LEN}_from_{}", input_generator.name()).as_str()); + + let null_scalar = input_generator.generate_scalar_with_null_value(); + let [non_null_scalar_1, non_null_scalar_2]: [_; 2] = input_generator + .generate_non_null_scalars(42, 2) + .try_into() + .unwrap(); + + let array_1_10pct_nulls = input_generator.generate_array(42, ARRAY_LEN, 
0.1); + let array_2_10pct_nulls = input_generator.generate_array(18, ARRAY_LEN, 0.1); + + let masks = mask_cases(ARRAY_LEN); + + // Benchmarks for different scalar combinations + for (description, truthy, falsy) in &[ + ("null_vs_non_null_scalar", &null_scalar, &non_null_scalar_1), + ( + "non_null_scalar_vs_null_scalar", + &non_null_scalar_1, + &null_scalar, + ), + ("non_nulls_scalars", &non_null_scalar_1, &non_null_scalar_2), + ] { + bench_zip_input_on_all_masks( + description, + &mut group, + &masks, + &Scalar::new(truthy), + &Scalar::new(falsy), + ); + } + + bench_zip_input_on_all_masks( + "array_vs_non_null_scalar", + &mut group, + &masks, + &array_1_10pct_nulls, + &Scalar::new(&non_null_scalar_1), // wrapped in `Scalar`, like the scalar cases above + ); + + bench_zip_input_on_all_masks( + "non_null_scalar_vs_array", + &mut group, + &masks, + &Scalar::new(&non_null_scalar_1), // the scalar is the truthy side in this case + &array_1_10pct_nulls, + ); + + bench_zip_input_on_all_masks( + "array_vs_array", + &mut group, + &masks, + &array_1_10pct_nulls, + &array_2_10pct_nulls, + ); + + group.finish(); +} + +fn bench_zip_input_on_all_masks( + description: &str, + group: &mut BenchmarkGroup, + masks: &[(&str, BooleanArray)], + truthy: &impl Datum, + falsy: &impl Datum, +) { + for (mask_description, mask) in masks { + let id = BenchmarkId::new(description, mask_description); + group.bench_with_input(id, mask, |b, mask| { + b.iter(|| hint::black_box(zip(mask, truthy, falsy).unwrap())) + }); + } +} + +fn add_benchmark(c: &mut Criterion) { + // Primitive + bench_zip_on_input_generator( + c, + &GeneratePrimitive:: { + description: "i32".to_string(), + _marker: std::marker::PhantomData, + }, + ); + + // Short strings + bench_zip_on_input_generator( + c, + &GenerateBytes::> { + description: "short strings (3..10)".to_string(), + range_length: 3..10, + _marker: std::marker::PhantomData, + }, + ); + + // Long strings + bench_zip_on_input_generator( + c, + &GenerateBytes::> { + description: "long strings (100..400)".to_string(), + range_length: 100..400, + _marker: std::marker::PhantomData, + }, + ); + + //
Short Bytes + bench_zip_on_input_generator( + c, + &GenerateBytes::> { + description: "short bytes (3..10)".to_string(), + range_length: 3..10, + _marker: std::marker::PhantomData, + }, + ); + + // Long Bytes + bench_zip_on_input_generator( + c, + &GenerateBytes::> { + description: "long bytes (100..400)".to_string(), + range_length: 100..400, + _marker: std::marker::PhantomData, + }, + ); +} + +criterion_group!(benches, add_benchmark); +criterion_main!(benches); diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs index 4bd648bc40ad..d85eb4aafdc3 100644 --- a/arrow/src/util/bench_util.rs +++ b/arrow/src/util/bench_util.rs @@ -155,6 +155,27 @@ fn create_string_array_with_len_range_and_prefix( min_str_len: usize, max_str_len: usize, prefix: &str, +) -> GenericStringArray { + create_string_array_with_len_range_and_prefix_and_seed( + size, + null_density, + min_str_len, + max_str_len, + prefix, + 42, + ) +} + +/// Creates a random [`GenericStringArray`] of a given `size` and `null_density` +/// filling it with random strings with lengths in the specified range, +/// all starting with the provided `prefix`, generated using the provided `seed`. 
+pub fn create_string_array_with_len_range_and_prefix_and_seed( + size: usize, + null_density: f32, + min_str_len: usize, + max_str_len: usize, + prefix: &str, + seed: u64, ) -> GenericStringArray { assert!( min_str_len <= max_str_len, @@ -165,7 +186,7 @@ fn create_string_array_with_len_range_and_prefix( "Prefix length must be <= max_str_len" ); - let rng = &mut seedable_rng(); + let rng = &mut StdRng::seed_from_u64(seed); (0..size) .map(|_| { if rng.random::() < null_density { @@ -449,8 +470,29 @@ pub fn create_binary_array( size: usize, null_density: f32, ) -> GenericBinaryArray { - let rng = &mut seedable_rng(); - let range_rng = &mut seedable_rng(); + create_binary_array_with_seed( + size, + null_density, + 42, // bytes_seed + 42, // bytes_length_seed + ) +} + +/// Creates a random [`GenericBinaryArray`] of a given `size` and `null_density` +/// filling it with random bytes, generated using the provided `seed`s. +/// +/// the `bytes_seed` is used to seed the RNG for generating the byte values, +/// while the `bytes_length_seed` is used to seed the RNG for generating the length of an array item +/// +/// These values can be the same as they are used to seed different RNGs internally. +pub fn create_binary_array_with_seed( + size: usize, + null_density: f32, + bytes_seed: u64, + bytes_length_seed: u64, +) -> GenericBinaryArray { + let rng = &mut StdRng::seed_from_u64(bytes_seed); + let range_rng = &mut StdRng::seed_from_u64(bytes_length_seed); (0..size) .map(|_| { @@ -467,6 +509,41 @@ pub fn create_binary_array( .collect() } +/// Creates a random [`GenericBinaryArray`] of a given `size` and `null_density` +/// filling it with random bytes with lengths in the specified range, +/// all starting with the provided `prefix`, generated using the provided `seed`. 
+/// +pub fn create_binary_array_with_len_range_and_prefix_and_seed( + size: usize, + null_density: f32, + min_len: usize, + max_len: usize, + prefix: &[u8], + seed: u64, +) -> GenericBinaryArray { + assert!(min_len <= max_len, "min_len must be <= max_len"); + assert!(prefix.len() <= max_len, "Prefix length must be <= max_len"); + + let rng = &mut StdRng::seed_from_u64(seed); + (0..size) + .map(|_| { + if rng.random::() < null_density { + None + } else { + let remaining_len = rng + .random_range(min_len.saturating_sub(prefix.len())..=(max_len - prefix.len())); + + let remaining = rng + .sample_iter::(StandardUniform) + .take(remaining_len); + + let value = prefix.iter().copied().chain(remaining).collect::>(); + Some(value) + } + }) + .collect() +} + /// Creates an random (but fixed-seeded) array of a given size and null density pub fn create_fsb_array(size: usize, null_density: f32, value_len: usize) -> FixedSizeBinaryArray { let rng = &mut seedable_rng();