arrow2_convert primitive (de)serialization benchmarks (#1742)

teh-cmc · web-flow · commit 1e84aa5f718f · 2023-03-31T17:43:57.000+02:00
* arrow2_convert primitive benchmarks

* addressing PR comments
diff --git a/crates/re_arrow_store/Cargo.toml b/crates/re_arrow_store/Cargo.toml
@@ -111,3 +111,7 @@ required-features = ["polars"]
 [[bench]]
 name = "data_store"
 harness = false
+
+[[bench]]
+name = "arrow2_convert"
+harness = false
diff --git a/crates/re_arrow_store/benches/arrow2_convert.rs b/crates/re_arrow_store/benches/arrow2_convert.rs
@@ -0,0 +1,141 @@
+//! Keeping track of performance issues/regressions in `arrow2_convert` that directly affect us.
+
+#[global_allocator]
+static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
+use arrow2::{array::PrimitiveArray, datatypes::PhysicalType, types::PrimitiveType};
+use criterion::{criterion_group, criterion_main, Criterion};
+use re_log_types::{
+    component_types::InstanceKey, external::arrow2_convert::deserialize::TryIntoCollection,
+    Component as _, DataCell,
+};
+
+// ---
+
+criterion_group!(benches, serialize, deserialize);
+criterion_main!(benches);
+
+// ---
+
+#[cfg(not(debug_assertions))]
+const NUM_INSTANCES: usize = 100_000;
+
+// `cargo test` also runs the benchmark setup code, so make sure it runs quickly:
+#[cfg(debug_assertions)]
+const NUM_INSTANCES: usize = 1;
+
+// ---
+
+/// Benchmarks building an `InstanceKey` cell out of `NUM_INSTANCES` primitive values.
+///
+/// Three construction paths share one benchmark group so their throughput can be compared:
+/// `DataCell::from_component`, `PrimitiveArray::from_values` and `PrimitiveArray::from_vec`.
+fn serialize(c: &mut Criterion) {
+    let mut group = c.benchmark_group(format!(
+        "arrow2_convert/serialize/primitive/instances={NUM_INSTANCES}"
+    ));
+    group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _));
+
+    // Path under test: build the cell straight from an iterator of keys.
+    group.bench_function("arrow2_convert", |b| {
+        b.iter(|| {
+            let cell = DataCell::from_component::<InstanceKey>(0..NUM_INSTANCES as u64);
+            assert_eq!(NUM_INSTANCES as u32, cell.num_instances());
+            assert_eq!(
+                cell.datatype().to_physical_type(),
+                PhysicalType::Primitive(PrimitiveType::UInt64)
+            );
+            cell
+        });
+    });
+
+    // Baseline: build the arrow array by hand, then wrap it with `DataCell::from_arrow`.
+    group.bench_function("arrow2/from_values", |b| {
+        b.iter(|| {
+            let values = PrimitiveArray::from_values(0..NUM_INSTANCES as u64).boxed();
+            let cell = DataCell::from_arrow(InstanceKey::name(), values);
+            assert_eq!(NUM_INSTANCES as u32, cell.num_instances());
+            assert_eq!(
+                cell.datatype().to_physical_type(),
+                PhysicalType::Primitive(PrimitiveType::UInt64)
+            );
+            cell
+        });
+    });
+
+    group.bench_function("arrow2/from_vec", |b| {
+        b.iter(|| {
+            // NOTE: We do the `collect()` here on purpose!
+            //
+            // All of these APIs have to allocate an array under the hood, except `from_vec`
+            // which is O(1) (it just unsafely reuses the vec's data pointer).
+            // We need to measure the collection in order to have a leveled playing field.
+            let values = PrimitiveArray::from_vec((0..NUM_INSTANCES as u64).collect()).boxed();
+            let cell = DataCell::from_arrow(InstanceKey::name(), values);
+            assert_eq!(NUM_INSTANCES as u32, cell.num_instances());
+            assert_eq!(
+                cell.datatype().to_physical_type(),
+                PhysicalType::Primitive(PrimitiveType::UInt64)
+            );
+            cell
+        });
+    });
+}
+
+/// Benchmarks decoding an arrow array of `NUM_INSTANCES` keys back into a `Vec<InstanceKey>`.
+///
+/// The input cell is built once, outside of the timed closures. Three decoding paths are
+/// registered in the same group:
+/// - `arrow2_convert`: `try_into_collection` on the array returned by `as_arrow`;
+/// - `arrow2/validity_checks`: downcast, then iterate optional values and drop the `None`s;
+/// - `arrow2/validity_bypass`: downcast, assert there is no validity bitmap, then iterate
+///   the values directly.
+///
+/// Every path also asserts the decoded length and the middle element.
+fn deserialize(c: &mut Criterion) {
+    let group_name = format!("arrow2_convert/deserialize/primitive/instances={NUM_INSTANCES}");
+    let mut group = c.benchmark_group(group_name);
+    group.throughput(criterion::Throughput::Elements(NUM_INSTANCES as _));
+
+    // Shared input: a single cell holding the keys `0..NUM_INSTANCES`, plus the arrow array
+    // it exposes through `as_arrow`.
+    let source_cell = DataCell::from_component::<InstanceKey>(0..NUM_INSTANCES as u64);
+    let array = source_cell.as_arrow();
+
+    // Conversion through the `TryIntoCollection` trait.
+    group.bench_function("arrow2_convert", |bencher| {
+        bencher.iter(|| {
+            let decoded: Vec<InstanceKey> = array.as_ref().try_into_collection().unwrap();
+            assert_eq!(NUM_INSTANCES, decoded.len());
+            assert_eq!(InstanceKey(NUM_INSTANCES as u64 / 2), decoded[NUM_INSTANCES / 2]);
+            decoded
+        });
+    });
+
+    // Hand-written conversion over optional values; `None` entries are filtered out.
+    group.bench_function("arrow2/validity_checks", |bencher| {
+        bencher.iter(|| {
+            let primitives = array.as_any().downcast_ref::<PrimitiveArray<u64>>().unwrap();
+            let decoded: Vec<InstanceKey> = primitives
+                .into_iter()
+                .filter_map(|value| value.copied().map(InstanceKey))
+                .collect();
+            assert_eq!(NUM_INSTANCES, decoded.len());
+            assert_eq!(InstanceKey(NUM_INSTANCES as u64 / 2), decoded[NUM_INSTANCES / 2]);
+            decoded
+        });
+    });
+
+    // Hand-written conversion that asserts the array has no validity bitmap first.
+    group.bench_function("arrow2/validity_bypass", |bencher| {
+        bencher.iter(|| {
+            let primitives = array.as_any().downcast_ref::<PrimitiveArray<u64>>().unwrap();
+            assert!(primitives.validity().is_none());
+            let decoded: Vec<InstanceKey> =
+                primitives.values_iter().copied().map(InstanceKey).collect();
+            assert_eq!(NUM_INSTANCES, decoded.len());
+            assert_eq!(InstanceKey(NUM_INSTANCES as u64 / 2), decoded[NUM_INSTANCES / 2]);
+            decoded
+        });
+    });
+}