Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 118 additions & 1 deletion arrow-string/src/like.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1708,7 +1708,93 @@ mod tests {
}

#[test]
fn like_scalar_null() {
fn string_null_like_pattern() {
// Different patterns have different execution code paths
for pattern in &[
"", // can execute as equality check
"_", // can execute as length check
"%", // can execute as starts_with("") or non-null check
"a%", // can execute as starts_with("a")
"%a", // can execute as ends_with("")
"a%b", // can execute as starts_with("a") && ends_with("b")
"%a%", // can_execute as contains("a")
"%a%b_c_d%e", // can_execute as regular expression
] {
let a = Scalar::new(StringArray::new_null(1));
let b = StringArray::new_scalar(pattern);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = Scalar::new(StringArray::new_null(1));
let b = StringArray::from_iter_values([pattern]);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = StringArray::new_null(1);
let b = StringArray::from_iter_values([pattern]);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = StringArray::new_null(1);
let b = StringArray::new_scalar(pattern);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");
}
}

#[test]
fn string_view_null_like_pattern() {
// Different patterns have different execution code paths
for pattern in &[
"", // can execute as equality check
"_", // can execute as length check
"%", // can execute as starts_with("") or non-null check
"a%", // can execute as starts_with("a")
"%a", // can execute as ends_with("")
"a%b", // can execute as starts_with("a") && ends_with("b")
"%a%", // can_execute as contains("a")
"%a%b_c_d%e", // can_execute as regular expression
] {
let a = Scalar::new(StringViewArray::new_null(1));
let b = StringViewArray::new_scalar(pattern);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = Scalar::new(StringViewArray::new_null(1));
let b = StringViewArray::from_iter_values([pattern]);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = StringViewArray::new_null(1);
let b = StringViewArray::from_iter_values([pattern]);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");

let a = StringViewArray::new_null(1);
let b = StringViewArray::new_scalar(pattern);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1, "With pattern {pattern}");
assert_eq!(r.null_count(), 1, "With pattern {pattern}");
assert!(r.is_null(0), "With pattern {pattern}");
}
}

#[test]
fn string_like_scalar_null() {
let a = StringArray::new_scalar("a");
let b = Scalar::new(StringArray::new_null(1));
let r = like(&a, &b).unwrap();
Expand Down Expand Up @@ -1737,4 +1823,35 @@ mod tests {
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));
}

#[test]
fn string_view_like_scalar_null() {
let a = StringViewArray::new_scalar("a");
let b = Scalar::new(StringViewArray::new_null(1));
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));

let a = StringViewArray::from_iter_values(["a"]);
let b = Scalar::new(StringViewArray::new_null(1));
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));

let a = StringViewArray::from_iter_values(["a"]);
let b = StringViewArray::new_null(1);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));

let a = StringViewArray::new_scalar("a");
let b = StringViewArray::new_null(1);
let r = like(&a, &b).unwrap();
assert_eq!(r.len(), 1);
assert_eq!(r.null_count(), 1);
assert!(r.is_null(0));
}
}
29 changes: 23 additions & 6 deletions arrow-string/src/predicate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use arrow_array::{ArrayAccessor, BooleanArray, StringViewArray};
use arrow_array::{Array, ArrayAccessor, BooleanArray, StringViewArray};
use arrow_schema::ArrowError;
use memchr::memchr2;
use memchr::memmem::Finder;
Expand Down Expand Up @@ -116,10 +116,17 @@ impl<'a> Predicate<'a> {
}),
Predicate::Contains(finder) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
let nulls = string_view_array.logical_nulls();
BooleanArray::from(
string_view_array
.bytes_iter()
.map(|haystack| finder.find(haystack).is_some() != negate)
.enumerate()
.map(|(idx, haystack)| {
if nulls.as_ref().map(|n| n.is_null(idx)).unwrap_or_default() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

my concern with this approach is that it adds code to the inner loop of the kernel.

We should definitely run benchmarks on this PR before merging

I think another way to solve this would be to either:

  1. Have two loops -- one when nulls was Some and the other when nulls was None (so we aren't checking each row)
  2. Leave the computation as is (don't check for nulls on the input) but instead simply copy the Nulls array from the input to the output (so any input that was null would also be null in the output)

Copy link
Contributor

@tustvold tustvold Nov 1, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FWIW I am not sure why this isn't using from_unary like the other codepaths which would provide option 2, and be significantly more efficient than first collecting into a Vec...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now i am more concerned about correctness. Producing fast but wrong results isn't as useful.
i changed contains to to use from_unary, that seemed trivially good.
@tustvold might this be it doesn't use from_unary because it wants to use prefix_bytes_iter and suffix_bytes_iter? Please advise! I think we can switch all operations to from_unary to fix correctness and leave tuning performance to a follow-up.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am running some benchmarks to see what, if any, impact this has in performance

cargo bench --bench comparison_kernels -- like

I will report back shortly

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ilike_utf8 scalar complex                          1.00      2.7±0.05ms        ? ?/sec                            1.03      2.8±0.09ms        ? ?/sec
ilike_utf8 scalar contains                         1.01      4.2±0.05ms        ? ?/sec                            1.00      4.2±0.07ms        ? ?/sec
ilike_utf8 scalar ends with                        1.02  1279.8±38.11µs        ? ?/sec                            1.00  1258.4±37.50µs        ? ?/sec
ilike_utf8 scalar equals                           1.03   801.1±23.96µs        ? ?/sec                            1.00   775.4±20.05µs        ? ?/sec
ilike_utf8 scalar starts with                      1.00  1165.9±38.61µs        ? ?/sec                            1.02  1193.0±54.84µs        ? ?/sec
ilike_utf8_scalar_dyn dictionary[10] string[4])    1.00     77.7±0.09µs        ? ?/sec                            1.00     77.6±0.07µs        ? ?/sec
like_utf8 scalar complex                           1.00  1924.8±43.83µs        ? ?/sec                            1.00  1924.7±49.77µs        ? ?/sec
like_utf8 scalar contains                          1.00  1562.1±14.30µs        ? ?/sec                            1.00  1567.9±23.23µs        ? ?/sec
like_utf8 scalar ends with                         1.02   432.7±11.70µs        ? ?/sec                            1.00    425.2±5.10µs        ? ?/sec
like_utf8 scalar equals                            1.16    127.0±0.17µs        ? ?/sec                            1.00    109.6±0.20µs        ? ?/sec
like_utf8 scalar starts with                       1.00    339.2±4.42µs        ? ?/sec                            1.02   347.4±15.72µs        ? ?/sec
like_utf8_scalar_dyn dictionary[10] string[4])     1.00     77.6±0.20µs        ? ?/sec                            1.00     77.4±0.12µs        ? ?/sec
like_utf8view scalar complex                       1.00    182.8±0.58ms        ? ?/sec                            1.01    184.9±0.67ms        ? ?/sec
like_utf8view scalar contains                      1.02    141.3±0.34ms        ? ?/sec                            1.00    138.2±0.32ms        ? ?/sec
like_utf8view scalar ends with 13 bytes            1.49     66.4±0.42ms        ? ?/sec                            1.00     44.7±0.27ms        ? ?/sec
like_utf8view scalar ends with 4 bytes             1.45     66.1±0.33ms        ? ?/sec                            1.00     45.4±0.35ms        ? ?/sec
like_utf8view scalar ends with 6 bytes             1.46     66.0±0.35ms        ? ?/sec                            1.00     45.2±0.32ms        ? ?/sec
like_utf8view scalar equals                        1.00     33.9±0.12ms        ? ?/sec                            1.11     37.8±0.10ms        ? ?/sec
like_utf8view scalar starts with 13 bytes          1.55     68.6±0.36ms        ? ?/sec                            1.00     44.2±0.33ms        ? ?/sec
like_utf8view scalar starts with 4 bytes           1.57     42.6±0.18ms        ? ?/sec                            1.00     27.2±0.13ms        ? ?/sec
like_utf8view scalar starts with 6 bytes           1.53     69.0±0.32ms        ? ?/sec                            1.00     45.2±0.28ms        ? ?/sec
nilike_utf8 scalar complex                         1.00      2.7±0.07ms        ? ?/sec                            1.00      2.7±0.06ms        ? ?/sec
nilike_utf8 scalar contains                        1.02      4.2±0.05ms        ? ?/sec                            1.00      4.1±0.05ms        ? ?/sec
nilike_utf8 scalar ends with                       1.01  1259.7±41.60µs        ? ?/sec                            1.00  1247.7±26.01µs        ? ?/sec
nilike_utf8 scalar equals                          1.05   801.3±25.80µs        ? ?/sec                            1.00   765.8±17.79µs        ? ?/sec
nilike_utf8 scalar starts with                     1.02  1171.4±33.96µs        ? ?/sec                            1.00  1146.6±33.64µs        ? ?/sec
nlike_utf8 scalar complex                          1.00  1918.4±31.09µs        ? ?/sec                            1.00  1924.4±43.22µs        ? ?/sec
nlike_utf8 scalar contains                         1.00  1566.2±19.53µs        ? ?/sec                            1.00  1566.7±16.01µs        ? ?/sec
nlike_utf8 scalar ends with                        1.00    428.5±5.39µs        ? ?/sec                            1.02   438.5±14.26µs        ? ?/sec
nlike_utf8 scalar equals                           1.16    127.0±0.15µs        ? ?/sec                            1.00    109.5±0.12µs        ? ?/sec
nlike_utf8 scalar starts with                      1.07   359.3±21.26µs        ? ?/sec                            1.00    337.2±4.81µs        ? ?/sec

It looks like there are some non trivial regressions (50%) or utf8view with scalar starts/ends.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did run benchmarks locally too. Thanks @alamb for the instructions.

First, on the commit 1621350 where the PR branch starts

git checkout
16213501769371c7d2d6ce299755337b6db4d8fd
caffeinate -ds cargo bench --bench comparison_kernels -- --save-baseline main like

then on 1e82884, the tip of the PR branch

git checkout findepi/fix-string-view-like-checks-with-null-values-1121b8
caffeinate -ds cargo bench --bench comparison_kernels -- --baseline main like

This command runs bechmark and produces comparison. Let me omit low changes (below 9%) for brevity, as well as outliers reporting.

significant performance regressions:

like_utf8view scalar ends with 4 bytes
                        time:   [36.885 ms 36.925 ms 36.972 ms]
                        change: [+77.568% +77.857% +78.161%] (p = 0.00 < 0.05)
                        Performance has regressed.

like_utf8view scalar ends with 6 bytes
                        time:   [36.761 ms 36.818 ms 36.881 ms]
                        change: [+77.369% +77.776% +78.195%] (p = 0.00 < 0.05)
                        Performance has regressed.

like_utf8view scalar ends with 13 bytes
                        time:   [36.537 ms 36.613 ms 36.694 ms]
                        change: [+78.308% +79.019% +79.752%] (p = 0.00 < 0.05)
                        Performance has regressed.

like_utf8view scalar starts with 4 bytes
                        time:   [25.490 ms 25.542 ms 25.596 ms]
                        change: [+112.76% +113.25% +113.76%] (p = 0.00 < 0.05)
                        Performance has regressed.

like_utf8view scalar starts with 6 bytes
                        time:   [36.556 ms 36.598 ms 36.645 ms]
                        change: [+84.943% +85.578% +86.205%] (p = 0.00 < 0.05)
                        Performance has regressed.

like_utf8view scalar starts with 13 bytes
                        time:   [37.065 ms 37.105 ms 37.152 ms]
                        change: [+86.735% +87.424% +88.116%] (p = 0.00 < 0.05)
                        Performance has regressed.

significant performance improvements:

ilike_utf8 scalar equals
                        time:   [475.56 µs 476.17 µs 476.93 µs]
                        change: [-47.498% -47.406% -47.310%] (p = 0.00 < 0.05)
                        Performance has improved.

ilike_utf8 scalar contains
                        time:   [2.4461 ms 2.4500 ms 2.4552 ms]
                        change: [-15.221% -15.008% -14.786%] (p = 0.00 < 0.05)
                        Performance has improved.

ilike_utf8 scalar ends with
                        time:   [546.82 µs 547.76 µs 548.86 µs]
                        change: [-43.736% -43.605% -43.466%] (p = 0.00 < 0.05)
                        Performance has improved.

ilike_utf8 scalar starts with
                        time:   [560.40 µs 561.46 µs 562.67 µs]
                        change: [-43.349% -42.997% -42.641%] (p = 0.00 < 0.05)
                        Performance has improved.

Benchmarking ilike_utf8 scalar complex: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 5.4s, enable flat sampling, or reduce sample count to 60.
ilike_utf8 scalar complex
                        time:   [1.0629 ms 1.0637 ms 1.0649 ms]
                        change: [-29.556% -29.299% -29.062%] (p = 0.00 < 0.05)
                        Performance has improved.

nilike_utf8 scalar equals
                        time:   [475.34 µs 476.23 µs 477.67 µs]
                        change: [-46.418% -46.259% -46.081%] (p = 0.00 < 0.05)
                        Performance has improved.

nilike_utf8 scalar contains
                        time:   [2.4427 ms 2.4449 ms 2.4477 ms]
                        change: [-15.899% -15.421% -15.120%] (p = 0.00 < 0.05)
                        Performance has improved.

nilike_utf8 scalar ends with
                        time:   [548.31 µs 548.96 µs 549.63 µs]
                        change: [-42.924% -42.742% -42.554%] (p = 0.00 < 0.05)
                        Performance has improved.

nilike_utf8 scalar starts with
                        time:   [557.88 µs 558.54 µs 559.35 µs]
                        change: [-42.683% -42.392% -42.144%] (p = 0.00 < 0.05)
                        Performance has improved.

Benchmarking nilike_utf8 scalar complex: Warming up for 3.0000 s
Warning: Unable to complete 100 samples in 5.0s. You may wish to increase target time to 5.4s, enable flat sampling, or reduce sample count to 60.
nilike_utf8 scalar complex
                        time:   [1.0648 ms 1.0669 ms 1.0696 ms]
                        change: [-28.834% -28.600% -28.353%] (p = 0.00 < 0.05)
                        Performance has improved.

TL;DR
i must be doing something wrong here. the benchmarks report improvement for code path i didn't change.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the results are apparently repeatable on my laptop

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay, easy to fix, actually

Copy link
Member Author

@findepi findepi Nov 5, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pushed performance-oriented changes
i still observe a regression for one case

like_utf8view scalar starts with 4 bytes
                        time:   [12.899 ms 12.939 ms 12.986 ms]
                        change: [+7.7003% +8.0298% +8.4320%] (p = 0.00 < 0.05)
                        Performance has regressed.

and still see improvements for many ilike cases e.g.

nilike_utf8 scalar ends with
                        time:   [547.64 µs 548.55 µs 549.63 µs]
                        change: [-42.656% -42.257% -41.801%] (p = 0.00 < 0.05)
                        Performance has improved.

i don't think i know how to move it further from here. @alamb @tustvold ptal

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking it out

return None;
}
Some(finder.find(haystack).is_some() != negate)
})
.collect::<Vec<_>>(),
)
} else {
Expand All @@ -130,11 +137,16 @@ impl<'a> Predicate<'a> {
}
Predicate::StartsWith(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
let nulls = string_view_array.logical_nulls();
BooleanArray::from(
string_view_array
.prefix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate
.enumerate()
.map(|(idx, haystack)| {
if nulls.as_ref().map(|n| n.is_null(idx)).unwrap_or_default() {
return None;
}
Some(equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate)
})
.collect::<Vec<_>>(),
)
Expand Down Expand Up @@ -166,11 +178,16 @@ impl<'a> Predicate<'a> {
}
Predicate::EndsWith(v) => {
if let Some(string_view_array) = array.as_any().downcast_ref::<StringViewArray>() {
let nulls = string_view_array.logical_nulls();
BooleanArray::from(
string_view_array
.suffix_bytes_iter(v.len())
.map(|haystack| {
equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate
.enumerate()
.map(|(idx, haystack)| {
if nulls.as_ref().map(|n| n.is_null(idx)).unwrap_or_default() {
return None;
}
Some(equals_bytes(haystack, v.as_bytes(), equals_kernel) != negate)
})
.collect::<Vec<_>>(),
)
Expand Down