Skip to content

Commit e6cb61f

Browse files
Dandandan and alamb authored
Speedup sorting for inline views: 1.4x - 1.7x improvement (#7856)
# Which issue does this PR close?

- Closes #7857

# Rationale for this change

Benchmark results:

```
sort string_view[0-400] nulls to indices 2^12          1.00     45.2±1.37µs        ? ?/sec    1.01     45.8±1.74µs        ? ?/sec
sort string_view[0-400] to indices 2^12                1.00     69.1±1.98µs        ? ?/sec    1.00     69.1±4.24µs        ? ?/sec
sort string_view[10] nulls to indices 2^12             1.00     40.8±1.81µs        ? ?/sec    1.37     55.7±3.90µs        ? ?/sec
sort string_view[10] to indices 2^12                   1.00     52.8±0.35µs        ? ?/sec    1.63     85.9±1.46µs        ? ?/sec
sort string_view_inlined[0-12] nulls to indices 2^12   1.00     40.9±1.99µs        ? ?/sec    1.29     52.6±1.76µs        ? ?/sec
sort string_view_inlined[0-12] to indices 2^12         1.00     50.6±0.27µs        ? ?/sec    1.68    85.0±12.24µs        ? ?/sec
```

# What changes are included in this PR?

Speed up sorting by specializing on batches that contain only inline views.

# Are these changes tested (are they covered by existing tests)?

Yes, by existing tests.

# Are there any user-facing changes?

No.

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 81ab147 commit e6cb61f

File tree

1 file changed

+60
-40
lines changed

1 file changed

+60
-40
lines changed

arrow-ord/src/sort.rs

Lines changed: 60 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -311,51 +311,71 @@ fn sort_byte_view<T: ByteViewType>(
311311
limit: Option<usize>,
312312
) -> UInt32Array {
313313
// 1. Build a list of (index, raw_view, length)
314-
let mut valids: Vec<_> = value_indices
315-
.into_iter()
316-
.map(|idx| {
317-
// SAFETY: we know idx < values.len()
318-
let raw = unsafe { *values.views().get_unchecked(idx as usize) };
319-
let len = raw as u32; // lower 32 bits encode length
320-
(idx, raw, len)
321-
})
322-
.collect();
323-
314+
let mut valids: Vec<_>;
324315
// 2. Compute the number of non-null entries to partially sort
325-
let vlimit = match (limit, options.nulls_first) {
326-
(Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()),
327-
_ => valids.len(),
316+
let vlimit: usize = match (limit, options.nulls_first) {
317+
(Some(l), true) => l.saturating_sub(nulls.len()).min(value_indices.len()),
318+
_ => value_indices.len(),
328319
};
320+
// 3.a Check if all views are inline (no data buffers)
321+
if values.data_buffers().is_empty() {
322+
valids = value_indices
323+
.into_iter()
324+
.map(|idx| {
325+
// SAFETY: we know idx < values.len()
326+
let raw = unsafe { *values.views().get_unchecked(idx as usize) };
327+
let inline_key = GenericByteViewArray::<T>::inline_key_fast(raw);
328+
(idx, inline_key)
329+
})
330+
.collect();
331+
let cmp_inline = |a: &(u32, u128), b: &(u32, u128)| a.1.cmp(&b.1);
329332

330-
// 3. Mixed comparator: first prefix, then inline vs full comparison
331-
let cmp_mixed = |a: &(u32, u128, u32), b: &(u32, u128, u32)| {
332-
let (_, raw_a, len_a) = *a;
333-
let (_, raw_b, len_b) = *b;
334-
335-
// 3.1 Both inline (≤12 bytes): compare full 128-bit key including length
336-
if len_a <= MAX_INLINE_VIEW_LEN && len_b <= MAX_INLINE_VIEW_LEN {
337-
return GenericByteViewArray::<T>::inline_key_fast(raw_a)
338-
.cmp(&GenericByteViewArray::<T>::inline_key_fast(raw_b));
333+
// Partially sort according to ascending/descending
334+
if !options.descending {
335+
sort_unstable_by(&mut valids, vlimit, cmp_inline);
336+
} else {
337+
sort_unstable_by(&mut valids, vlimit, |x, y| cmp_inline(x, y).reverse());
339338
}
339+
} else {
340+
valids = value_indices
341+
.into_iter()
342+
.map(|idx| {
343+
// SAFETY: we know idx < values.len()
344+
let raw = unsafe { *values.views().get_unchecked(idx as usize) };
345+
(idx, raw)
346+
})
347+
.collect();
348+
// 3.b Mixed comparator: first prefix, then inline vs full comparison
349+
let cmp_mixed = |a: &(u32, u128), b: &(u32, u128)| {
350+
let (_, raw_a) = *a;
351+
let (_, raw_b) = *b;
352+
let len_a = raw_a as u32;
353+
let len_b = raw_b as u32;
354+
// 3.b.1 Both inline (≤12 bytes): compare full 128-bit key including length
355+
if len_a <= MAX_INLINE_VIEW_LEN && len_b <= MAX_INLINE_VIEW_LEN {
356+
return GenericByteViewArray::<T>::inline_key_fast(raw_a)
357+
.cmp(&GenericByteViewArray::<T>::inline_key_fast(raw_b));
358+
}
340359

341-
// 3.2 Compare 4-byte prefix in big-endian order
342-
let pref_a = ByteView::from(raw_a).prefix.swap_bytes();
343-
let pref_b = ByteView::from(raw_b).prefix.swap_bytes();
344-
if pref_a != pref_b {
345-
return pref_a.cmp(&pref_b);
346-
}
360+
// 3.b.2 Compare 4-byte prefix in big-endian order
361+
let pref_a = ByteView::from(raw_a).prefix.swap_bytes();
362+
let pref_b = ByteView::from(raw_b).prefix.swap_bytes();
363+
if pref_a != pref_b {
364+
return pref_a.cmp(&pref_b);
365+
}
347366

348-
// 3.3 Fallback to full byte-slice comparison
349-
let full_a: &[u8] = unsafe { values.value_unchecked(a.0 as usize).as_ref() };
350-
let full_b: &[u8] = unsafe { values.value_unchecked(b.0 as usize).as_ref() };
351-
full_a.cmp(full_b)
352-
};
367+
// 3.b.3 Fallback to full byte-slice comparison
368+
let full_a: &[u8] = unsafe { values.value_unchecked(a.0 as usize).as_ref() };
369+
let full_b: &[u8] = unsafe { values.value_unchecked(b.0 as usize).as_ref() };
370+
full_a.cmp(full_b)
371+
};
353372

354-
// 4. Partially sort according to ascending/descending
355-
if !options.descending {
356-
sort_unstable_by(&mut valids, vlimit, cmp_mixed);
357-
} else {
358-
sort_unstable_by(&mut valids, vlimit, |x, y| cmp_mixed(x, y).reverse());
373+
// 3.b.4 Partially sort according to ascending/descending
374+
if !options.descending {
375+
sort_unstable_by(&mut valids, vlimit, cmp_mixed);
376+
} else {
377+
sort_unstable_by(&mut valids, vlimit, |x, y| cmp_mixed(x, y).reverse());
378+
}
359379
}
360380

361381
// 5. Assemble nulls and sorted indices into final output
@@ -367,10 +387,10 @@ fn sort_byte_view<T: ByteViewType>(
367387
// Place null indices first
368388
out.extend_from_slice(&nulls[..nulls.len().min(out_limit)]);
369389
let rem = out_limit - out.len();
370-
out.extend(valids.iter().map(|&(i, _, _)| i).take(rem));
390+
out.extend(valids.iter().map(|&(i, _)| i).take(rem));
371391
} else {
372392
// Place non-null indices first
373-
out.extend(valids.iter().map(|&(i, _, _)| i).take(out_limit));
393+
out.extend(valids.iter().map(|&(i, _)| i).take(out_limit));
374394
let rem = out_limit - out.len();
375395
out.extend_from_slice(&nulls[..rem]);
376396
}

0 commit comments

Comments
 (0)